Remove irrelevant comment in _update_causal_mask #32363

Open · wants to merge 1 commit into base: main

src/transformers/models/bloom/modeling_bloom.py (5 changes: 0 additions & 5 deletions)

@@ -790,11 +790,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask

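Note on the `flash_attention_2` branch that now opens `_update_causal_mask`: FlashAttention-2 consumes the 2D padding mask directly, so the method only has to decide whether any padding is present at all. A minimal sketch of that check in plain PyTorch (illustrative only, not the library code; the tensor values are made up):

import torch

# Hypothetical 2D padding mask for a batch of two sequences (1.0 = attend, 0.0 = padding).
attention_mask = torch.tensor([[1.0, 1.0, 1.0, 1.0],
                               [1.0, 1.0, 0.0, 0.0]])

# Mirrors the check kept in `_update_causal_mask`: with flash_attention_2, hand back the
# 2D mask unchanged when it actually contains padding; otherwise fall through, and the
# kernel can treat the batch as dense and purely causal.
if attention_mask is not None and 0.0 in attention_mask:
    mask_for_flash_attention = attention_mask  # no 4D expansion needed
else:
    mask_for_flash_attention = None

print(mask_for_flash_attention)
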
src/transformers/models/chameleon/modeling_chameleon.py (5 changes: 0 additions & 5 deletions)

@@ -1433,11 +1433,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask

src/transformers/models/cohere/modeling_cohere.py (14 changes: 8 additions & 6 deletions)

@@ -903,6 +903,7 @@ def forward(
             attentions=all_self_attns,
         )

+    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
     def _update_causal_mask(
         self,
         attention_mask: torch.Tensor,
@@ -911,11 +912,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
@@ -1136,7 +1132,13 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}

-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+        # Prepare a 4D attention mask here so as to have inputs statically shaped when using StaticCache (torch.compile compatibility).
+        # `flash_attention_2` attention implementation requires a 2D attention mask.
+        if (
+            isinstance(past_key_values, StaticCache)
+            and attention_mask.ndim == 2
+            and self.config._attn_implementation != "flash_attention_2"
+        ):
             if inputs_embeds is not None:
                 batch_size, sequence_length = inputs_embeds.shape
                 device = inputs_embeds.device

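The comment added above states the reason for the stricter condition: with a `StaticCache`, every decode step should feed the model tensors of a fixed shape so `torch.compile` can keep reusing one graph, while `flash_attention_2` is excluded because it needs the raw 2D mask. A rough, self-contained sketch of turning a growing 2D padding mask into a statically shaped 4D additive mask (an illustration under those assumptions, not the exact helper the models call; the function name and arguments are invented for the example):

import torch

def build_static_4d_mask(attention_mask_2d, query_length, max_cache_len, dtype, cache_position):
    """Expand a (batch, seq) padding mask into a (batch, 1, query_length, max_cache_len)
    additive mask whose shape depends only on static sizes (illustrative sketch)."""
    min_value = torch.finfo(dtype).min
    batch_size = attention_mask_2d.shape[0]

    # Start fully masked over the static cache length.
    mask = torch.full((query_length, max_cache_len), min_value, dtype=dtype)
    # Causal structure: query position i may attend to cache slots <= cache_position[i].
    allowed = torch.arange(max_cache_len) <= cache_position.reshape(-1, 1)
    mask = mask.masked_fill(allowed, 0.0)
    mask = mask[None, None, :, :].expand(batch_size, 1, -1, -1).clone()

    # Re-apply padding from the 2D mask on the positions it covers.
    pad_len = attention_mask_2d.shape[-1]
    padding = attention_mask_2d[:, None, None, :].eq(0.0)
    mask[:, :, :, :pad_len] = mask[:, :, :, :pad_len].masked_fill(padding, min_value)
    return mask

# One decode step: a single query token, a static cache of 8 slots, 3 tokens already cached.
mask_2d = torch.tensor([[1.0, 1.0, 1.0, 1.0]])  # the 2D mask generate passes grows each step
mask_4d = build_static_4d_mask(mask_2d, query_length=1, max_cache_len=8,
                               dtype=torch.float32, cache_position=torch.tensor([3]))
print(mask_4d.shape)  # torch.Size([1, 1, 1, 8]): the shape is fixed by max_cache_len
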
src/transformers/models/dbrx/modeling_dbrx.py (13 changes: 7 additions & 6 deletions)

@@ -1163,11 +1163,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
@@ -1407,7 +1402,13 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}

-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+        # Prepare a 4D attention mask here so as to have inputs statically shaped when using StaticCache (torch.compile compatibility).
+        # `flash_attention_2` attention implementation requires a 2D attention mask.
+        if (
+            isinstance(past_key_values, StaticCache)
+            and attention_mask.ndim == 2
+            and self.config._attn_implementation != "flash_attention_2"
+        ):
             if inputs_embeds is not None:
                 batch_size, sequence_length = inputs_embeds.shape
                 device = inputs_embeds.device

src/transformers/models/gemma/modeling_gemma.py (6 changes: 1 addition & 5 deletions)

@@ -922,6 +922,7 @@ def forward(
             attentions=all_self_attns,
         )

+    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
     def _update_causal_mask(
         self,
         attention_mask: torch.Tensor,
@@ -930,11 +931,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask

src/transformers/models/jetmoe/modeling_jetmoe.py (5 changes: 0 additions & 5 deletions)

@@ -1135,11 +1135,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask

src/transformers/models/llama/modeling_llama.py (13 changes: 7 additions & 6 deletions)

@@ -1044,11 +1044,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
@@ -1269,7 +1264,13 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}

-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+        # Prepare a 4D attention mask here so as to have inputs statically shaped when using StaticCache (torch.compile compatibility).
+        # `flash_attention_2` attention implementation requires a 2D attention mask.
+        if (
+            isinstance(past_key_values, StaticCache)
+            and attention_mask.ndim == 2
+            and self.config._attn_implementation != "flash_attention_2"
+        ):
             if inputs_embeds is not None:
                 batch_size, sequence_length = inputs_embeds.shape
                 device = inputs_embeds.device

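The TODO block deleted throughout this PR described the original motivation for that 4D-mask preparation: `generate` passes a 2D `attention_mask` whose length grows by one at each decode step, and a tensor whose shape keeps changing pushes `torch.compile` (and, under `mode="reduce-overhead"`, its CUDA graph capture) into re-specializing instead of replaying a single graph. A toy, standalone illustration of the shape issue (not the transformers code path; actual recompilation behaviour depends on the torch version and its dynamic-shape settings):

import torch

@torch.compile(fullgraph=True)
def apply_mask(scores, attention_mask):
    # Tiny stand-in for an attention step that consumes the padding mask.
    return scores.masked_fill(attention_mask == 0.0, float("-inf")).softmax(dim=-1)

for step in range(1, 5):
    # With a plain 2D padding mask, the last dimension grows every decode step,
    # so the compiled function keeps seeing new input shapes.
    mask = torch.ones(1, step)
    scores = torch.randn(1, step)
    apply_mask(scores, mask)

# A 4D mask padded out to the static cache length keeps the shapes constant across
# steps, which is what the `prepare_inputs_for_generation` change above arranges
# whenever a StaticCache is in use (except for flash_attention_2).
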
src/transformers/models/mixtral/modeling_mixtral.py (5 changes: 0 additions & 5 deletions)

@@ -1117,11 +1117,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask

src/transformers/models/olmo/modeling_olmo.py (13 changes: 7 additions & 6 deletions)

@@ -957,11 +957,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
@@ -1180,7 +1175,13 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}

-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+        # Prepare a 4D attention mask here so as to have inputs statically shaped when using StaticCache (torch.compile compatibility).
+        # `flash_attention_2` attention implementation requires a 2D attention mask.
+        if (
+            isinstance(past_key_values, StaticCache)
+            and attention_mask.ndim == 2
+            and self.config._attn_implementation != "flash_attention_2"
+        ):
             if inputs_embeds is not None:
                 batch_size, sequence_length = inputs_embeds.shape
                 device = inputs_embeds.device

src/transformers/models/persimmon/modeling_persimmon.py (13 changes: 7 additions & 6 deletions)

@@ -769,11 +769,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
@@ -997,7 +992,13 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}

-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+        # Prepare a 4D attention mask here so as to have inputs statically shaped when using StaticCache (torch.compile compatibility).
+        # `flash_attention_2` attention implementation requires a 2D attention mask.
+        if (
+            isinstance(past_key_values, StaticCache)
+            and attention_mask.ndim == 2
+            and self.config._attn_implementation != "flash_attention_2"
+        ):
             if inputs_embeds is not None:
                 batch_size, sequence_length = inputs_embeds.shape
                 device = inputs_embeds.device

src/transformers/models/phi/modeling_phi.py (13 changes: 7 additions & 6 deletions)

@@ -1053,11 +1053,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
    ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
@@ -1282,7 +1277,13 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}

-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+        # Prepare a 4D attention mask here so as to have inputs statically shaped when using StaticCache (torch.compile compatibility).
+        # `flash_attention_2` attention implementation requires a 2D attention mask.
+        if (
+            isinstance(past_key_values, StaticCache)
+            and attention_mask.ndim == 2
+            and self.config._attn_implementation != "flash_attention_2"
+        ):
             if inputs_embeds is not None:
                 batch_size, sequence_length = inputs_embeds.shape
                 device = inputs_embeds.device

src/transformers/models/phi3/modeling_phi3.py (13 changes: 7 additions & 6 deletions)

@@ -1096,11 +1096,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
@@ -1325,7 +1320,13 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}

-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+        # Prepare a 4D attention mask here so as to have inputs statically shaped when using StaticCache (torch.compile compatibility).
+        # `flash_attention_2` attention implementation requires a 2D attention mask.
+        if (
+            isinstance(past_key_values, StaticCache)
+            and attention_mask.ndim == 2
+            and self.config._attn_implementation != "flash_attention_2"
+        ):
             if inputs_embeds is not None:
                 batch_size, sequence_length = inputs_embeds.shape
                 device = inputs_embeds.device

src/transformers/models/qwen2/modeling_qwen2.py (13 changes: 7 additions & 6 deletions)

@@ -955,11 +955,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
@@ -1177,7 +1172,13 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}

-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+        # Prepare a 4D attention mask here so as to have inputs statically shaped when using StaticCache (torch.compile compatibility).
+        # `flash_attention_2` attention implementation requires a 2D attention mask.
+        if (
+            isinstance(past_key_values, StaticCache)
+            and attention_mask.ndim == 2
+            and self.config._attn_implementation != "flash_attention_2"
+        ):
             if inputs_embeds is not None:
                 batch_size, sequence_length = inputs_embeds.shape
                 device = inputs_embeds.device

src/transformers/models/qwen2_moe/modeling_qwen2_moe.py (13 changes: 7 additions & 6 deletions)

@@ -1128,11 +1128,6 @@ def _update_causal_mask(
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
-        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
-        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
-        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
-
         if self.config._attn_implementation == "flash_attention_2":
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
@@ -1373,7 +1368,13 @@ def prepare_inputs_for_generation(
         else:
             model_inputs = {"input_ids": input_ids}

-        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+        # Prepare a 4D attention mask here so as to have inputs statically shaped when using StaticCache (torch.compile compatibility).
+        # `flash_attention_2` attention implementation requires a 2D attention mask.
+        if (
+            isinstance(past_key_values, StaticCache)
+            and attention_mask.ndim == 2
+            and self.config._attn_implementation != "flash_attention_2"
+        ):
             if inputs_embeds is not None:
                 batch_size, sequence_length = inputs_embeds.shape
                 device = inputs_embeds.device

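Taken together, these per-model edits drop the stale TODO while keeping the static-cache plus `torch.compile` workflow they reference working. A hedged sketch of that workflow (the checkpoint name and generation settings are placeholders, and the exact configuration API may differ between transformers versions):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# Ask generate for a static KV cache so every decode step sees the same tensor shapes...
model.generation_config.cache_implementation = "static"
# ...then compile the forward pass; the statically shaped 4D mask built in
# `prepare_inputs_for_generation` is what makes `fullgraph=True` workable here.
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

inputs = tokenizer("Static caches make compilation", return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))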