
adding use case of interpolate_pos = False and respective testing to all models
Manuel Sanchez Hernandez committed Aug 4, 2024
1 parent 0cd8a8f commit db5a20a
Showing 12 changed files with 132 additions and 608 deletions.
13 changes: 10 additions & 3 deletions src/transformers/models/altclip/modeling_altclip.py
@@ -751,7 +751,7 @@ def forward(
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""

bsz, tgt_len, embed_dim = hidden_states.size()
@@ -840,6 +840,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return hidden_states


+# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->AltCLIP
class AltCLIPEncoderLayer(nn.Module):
def __init__(self, config: AltCLIPConfig):
super().__init__()
@@ -890,6 +891,7 @@ def forward(
return outputs


+# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->AltCLIP
class AltCLIPEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
@@ -1047,6 +1049,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:

def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
+        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+            )
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
@@ -1117,6 +1123,7 @@ def _init_weights(self, module):
module.weight.data[module.padding_idx].zero_()


+# Copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer with CLIPVisionTransformer->AltCLIPVisionTransformer,CLIPVisionConfig->AltCLIPVisionConfig,CLIPVisionEmbeddings->AltCLIPVisionEmbeddings,CLIPEncoder->AltCLIPEncoder,CLIP_VISION_INPUTS_DOCSTRING->ALTCLIP_VISION_INPUTS_DOCSTRING
class AltCLIPVisionTransformer(nn.Module):
def __init__(self, config: AltCLIPVisionConfig):
super().__init__()
@@ -1508,12 +1515,12 @@ def __init__(self, config: AltCLIPConfig):
super().__init__(config)

if not isinstance(config.vision_config, AltCLIPVisionConfig):
-            raise TypeError(
+            raise ValueError(
"config.vision_config is expected to be of type AltCLIPVisionConfig but is of type"
f" {type(config.vision_config)}."
)
if not isinstance(config.text_config, AltCLIPTextConfig):
-            raise TypeError(
+            raise ValueError(
"config.text_config is expected to be of type AltCLIPTextConfig but is of type"
f" {type(config.text_config)}."
)
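
The core change repeated in each vision model below is the new guard in the embeddings' forward: when interpolate_pos_encoding is False, an input whose resolution differs from config.image_size is rejected with a ValueError up front instead of failing later with a shape mismatch. A minimal sketch of the intended behaviour, assuming the top-level AltCLIPVisionModel forward exposes the same interpolate_pos_encoding flag as the embeddings layer above and that image_size defaults to 224:

import torch
from transformers import AltCLIPVisionConfig, AltCLIPVisionModel

config = AltCLIPVisionConfig()               # image_size assumed to default to 224
model = AltCLIPVisionModel(config)

pixel_values = torch.randn(1, 3, 256, 256)   # resolution deliberately differs from image_size

# Without interpolation, the new check raises immediately.
try:
    model(pixel_values=pixel_values, interpolate_pos_encoding=False)
except ValueError as err:
    print(err)  # Input image size (256*256) doesn't match model (224*224).

# With interpolation, the position embeddings are resized to fit the 256x256 input.
outputs = model(pixel_values=pixel_values, interpolate_pos_encoding=True)
print(outputs.last_hidden_state.shape)
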
4 changes: 4 additions & 0 deletions src/transformers/models/bridgetower/modeling_bridgetower.py
@@ -314,6 +314,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:

def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
+        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+            )
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
8 changes: 6 additions & 2 deletions src/transformers/models/chinese_clip/modeling_chinese_clip.py
@@ -224,6 +224,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:

def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
+        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})."
+            )
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
@@ -1385,13 +1389,13 @@ def __init__(self, config: ChineseCLIPConfig):
super().__init__(config)

if not isinstance(config.text_config, ChineseCLIPTextConfig):
-            raise TypeError(
+            raise ValueError(
"config.text_config is expected to be of type ChineseCLIPTextConfig but is of type"
f" {type(config.text_config)}."
)

if not isinstance(config.vision_config, ChineseCLIPVisionConfig):
-            raise TypeError(
+            raise ValueError(
"config.vision_config is expected to be of type ChineseCLIPVisionConfig but is of type"
f" {type(config.vision_config)}."
)
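
The commit message mentions adding respective tests for all models, but the test files themselves are not part of the excerpt above. A hypothetical sketch of what such a check could look like, assuming a helper that receives an already-built vision model whose config image_size differs from the test resolution:

import pytest
import torch


def check_interpolate_pos_encoding_guard(model, mismatched_size=256):
    # Hypothetical helper, not the commit's actual test.
    # A resolution other than config.image_size must be rejected while interpolation is off.
    pixel_values = torch.randn(1, 3, mismatched_size, mismatched_size)
    with pytest.raises(ValueError, match="doesn't match model"):
        model(pixel_values=pixel_values, interpolate_pos_encoding=False)

    # The same input is accepted once interpolation of the position embeddings is requested.
    outputs = model(pixel_values=pixel_values, interpolate_pos_encoding=True)
    assert outputs.last_hidden_state.shape[0] == pixel_values.shape[0]
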

