From 63e8a34947005148f6ed1854145a887f09d4f0d0 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Sun, 11 Aug 2024 19:21:33 +0200 Subject: [PATCH 01/49] adding positional encoder changes and tests --- .../models/altclip/modeling_altclip.py | 58 +++++++++++++- .../bridgetower/modeling_bridgetower.py | 75 ++++++++++++++++--- .../chinese_clip/modeling_chinese_clip.py | 60 ++++++++++++++- src/transformers/models/clip/modeling_clip.py | 64 +++++++++++++++- src/transformers/models/git/modeling_git.py | 43 ++++++++++- .../models/kosmos2/modeling_kosmos2.py | 58 +++++++++++++- .../models/x_clip/modeling_x_clip.py | 57 +++++++++++++- tests/models/altclip/test_modeling_altclip.py | 36 +++++++++ .../bridgetower/test_modeling_bridgetower.py | 36 +++++++++ .../test_modeling_chinese_clip.py | 38 ++++++++++ tests/models/clip/test_modeling_clip.py | 37 +++++++++ tests/models/kosmos2/test_modeling_kosmos2.py | 37 +++++++++ tests/models/x_clip/test_modeling_x_clip.py | 38 ++++++++++ 13 files changed, 604 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index f9856ef701f9..7186afb28b1f 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -100,6 +100,8 @@ output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding(`bool`, defaults to `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -1009,15 +1011,56 @@ def __init__(self, config: AltCLIPVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." 
+ ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -1097,6 +1140,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -1111,7 +1155,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1156,6 +1200,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1186,6 +1231,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1546,6 +1592,7 @@ def get_image_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1578,6 +1625,7 @@ def get_image_features( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1598,6 +1646,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, AltCLIPOutput]: r""" @@ -1642,6 +1691,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 91cbda9b72ed..2075ca007439 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -111,6 +111,8 @@ output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, defaults to `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @@ -276,15 +278,56 @@ def __init__(self, config: BridgeTowerVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." 
+ ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -302,8 +345,13 @@ def __init__(self, config): [nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) for _ in range(config.num_hidden_layers)] ) - def forward(self, pixel_values: torch.Tensor, attention_mask): - hidden_states = self.embeddings(pixel_values) + def forward( + self, + pixel_values: torch.Tensor, + attention_mask, + interpolate_pos_encoding: bool = False, + ): + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding) hidden_states = self.ln_pre(hidden_states) # NLD -> LND hidden_states = hidden_states.permute(1, 0, 2) @@ -324,8 +372,12 @@ def forward(self, pixel_values: torch.Tensor, attention_mask): hidden_states = torch.stack(hidden_states_stack, dim=0) return hidden_states - def forward_pre(self, pixel_values: torch.Tensor): - hidden_states = self.embeddings(pixel_values) + def forward_pre( + self, + pixel_values: torch.Tensor, + interpolate_pos_encoding: bool = False, + ): + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.ln_pre(hidden_states) # NLD -> LND hidden_states = hidden_states.permute(1, 0, 2) @@ -1015,8 +1067,8 @@ def __init__(self, config): def dtype(self): return self.visual.embeddings.patch_embedding.weight.dtype - def forward(self, image, image_mask=None): - return self.visual(image.type(self.dtype), image_mask) + def forward(self, image, image_mask=None, interpolate_pos_encoding=False): + return self.visual(image.type(self.dtype), image_mask, interpolate_pos_encoding) class BridgeTowerTextModel(BridgeTowerPreTrainedModel): @@ -1280,6 +1332,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, ) -> Union[Tuple[torch.Tensor], BridgeTowerModelOutput]: r""" output_hidden_states (`bool`, *optional*): @@ -1352,7 +1405,9 @@ def forward( all_hidden_states_text += (text_embeds,) if image_embeds is None: - image_embeds = self.vision_model.visual.forward_pre(pixel_values.type(self.vision_model.dtype)) + image_embeds = self.vision_model.visual.forward_pre( + pixel_values.type(self.vision_model.dtype), interpolate_pos_encoding=interpolate_pos_encoding + ) else: # Permute as BridgeTowerResidualAttention has batch_first=True image_embeds = image_embeds.permute(1, 0, 2) diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 6fbd9459f5ad..9ae56a727ca7 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -188,15 +188,56 @@ def __init__(self, config: ChineseCLIPVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def 
forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." + ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -798,6 +839,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -813,6 +856,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. 
+ interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -1052,6 +1097,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1066,7 +1112,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1299,6 +1345,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1329,6 +1376,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1425,6 +1473,7 @@ def get_image_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1461,6 +1510,7 @@ def get_image_features( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1481,6 +1531,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, ChineseCLIPOutput]: r""" @@ -1516,6 +1567,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index ee85fe312587..e6131f888582 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -14,6 +14,7 @@ # limitations under the License. """PyTorch CLIP model.""" +import math from dataclasses import dataclass from typing import Any, Optional, Tuple, Union @@ -185,15 +186,56 @@ def __init__(self, config: CLIPVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." + ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -693,8 +735,11 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ CLIP_INPUTS_DOCSTRING = r""" @@ -730,6 +775,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @@ -1012,6 +1059,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -1026,7 +1074,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1076,6 +1124,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1107,6 +1156,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, ) @@ -1203,6 +1253,7 @@ def get_image_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1238,6 +1289,7 @@ def get_image_features( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1257,6 +1309,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPOutput]: r""" @@ -1294,6 +1347,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1455,6 +1509,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPVisionModelOutput]: r""" @@ -1484,6 +1539,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 581f2b3947b4..f120896d11ec 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -624,15 +624,52 @@ def __init__(self, config: GitVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 69641790b2db..ca27fa3ace23 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -120,6 +120,8 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -258,6 +260,8 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @@ -400,15 +404,56 @@ def __init__(self, config: Kosmos2VisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." 
+ ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -700,6 +745,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -711,7 +757,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1442,6 +1488,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1452,6 +1499,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1768,6 +1816,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, Kosmos2ModelOutput]: r""" @@ -1819,6 +1868,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`. diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 791e501d1737..23618c2ed399 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -14,6 +14,7 @@ # limitations under the License. """PyTorch X-CLIP model.""" +import math from copy import copy from dataclasses import dataclass from typing import Any, Optional, Tuple, Union @@ -121,15 +122,56 @@ def __init__(self, config: XCLIPVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." + ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -567,6 +609,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -604,6 +648,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @@ -954,6 +1000,7 @@ def forward( pixel_values: torch.FloatTensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -966,7 +1013,7 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layernorm(hidden_states) encoder_outputs = self.encoder( @@ -1455,6 +1502,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, XCLIPOutput]: r""" @@ -1555,6 +1603,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 83b6d60595d3..ef713753aa54 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -597,3 +597,39 @@ def test_inference(self): expected_probs = torch.tensor([[9.9942e-01, 5.7805e-04]], device=torch_device) self.assertTrue(torch.allclose(probs, expected_probs, atol=5e-3)) + + @slow + def test_inference_interpolate_pos_encoding(self): + # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+        model_name = "BAAI/AltCLIP"
+        model = AltCLIPModel.from_pretrained(model_name).to(torch_device)
+
+        image_processor = AltCLIPProcessor.from_pretrained(
+            model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 257, 1024))
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[-0.5297, -0.7713, 0.4655], [0.8688, 0.1690, 0.6678], [1.1742, -0.7551, 0.0396]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )
\ No newline at end of file
diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py
index 44e6a404f623..cae36f151e44 100644
--- a/tests/models/bridgetower/test_modeling_bridgetower.py
+++ b/tests/models/bridgetower/test_modeling_bridgetower.py
@@ -656,3 +656,39 @@ def test_training(self):
         for name, param in model.named_parameters():
             if self._is_layer_used(model_class, name):
                 self.assertIsNotNone(param.grad, f"Gradients should not be None - got {param.grad} for {name}")
+
+    @slow
+    def test_inference_interpolate_pos_encoding(self):
+        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
+        # allowing to interpolate the pre-trained position embeddings in order to use
+        # the model on higher resolutions. The DINO model by Facebook AI leverages this
+        # to visualize self-attention on higher resolution images.
+        model_name = "BridgeTower/bridgetower-base"
+        model = BridgeTowerModel.from_pretrained(model_name).to(torch_device)
+
+        image_processor = BridgeTowerProcessor.from_pretrained(
+            model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encoding=False should raise a ValueError
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 901, 768))
+
+        self.assertEqual(outputs.image_features.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.3433, 0.4557, -0.5287], [-0.7111, 0.6576, -1.0850], [-0.2122, 0.2021, -0.0536]]
+        ).to(torch_device)
+
+        self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4))
\ No newline at end of file
diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py
index 7046f28b5f94..41dc1c16f9a8 100644
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -740,3 +740,41 @@ def test_inference(self):
         expected_probs = torch.tensor([[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]], device=torch_device)
 
         self.assertTrue(torch.allclose(probs, expected_probs, atol=5e-3))
+
+    @slow
+    def test_inference_interpolate_pos_encoding(self):
+        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
+        # allowing to interpolate the pre-trained position embeddings in order to use
+        # the model on higher resolutions. The DINO model by Facebook AI leverages this
+        # to visualize self-attention on higher resolution images.
+        model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
+        model = ChineseCLIPModel.from_pretrained(model_name).to(torch_device)
+
+        image_processor = ChineseCLIPProcessor.from_pretrained(
+            model_name, size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encoding=False should raise a ValueError
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 122, 768))
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[-0.3374, 0.3212, -0.1293], [-0.2208, -0.6150, 0.7010], [-0.1901, -0.6576, 0.4843]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )
\ No newline at end of file
diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py
index 3b6994428088..170a5baa9b4d 100644
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -1179,3 +1179,40 @@ def test_inference(self):
         expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device)
 
         self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
+
+    @slow
+    def test_inference_interpolate_pos_encoding(self):
+        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
+        # allowing to interpolate the pre-trained position embeddings in order to use
+        # the model on higher resolutions. The DINO model by Facebook AI leverages this
+        # to visualize self-attention on higher resolution images.
+        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device)
+
+        processor = CLIPProcessor.from_pretrained(
+            "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encoding=False should raise a ValueError
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 26, 768))
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )
\ No newline at end of file
diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py
index 6f34689004ef..0a2dc3c74d0d 100644
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -762,3 +762,40 @@ def test_snowman_image_captioning_batch(self):
         self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_0)
         self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_0)
         self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_0)
+
+    @slow
+    def test_inference_interpolate_pos_encoding(self):
+        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
+        # allowing to interpolate the pre-trained position embeddings in order to use
+        # the model on higher resolutions. The DINO model by Facebook AI leverages this
+        # to visualize self-attention on higher resolution images.
+        model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
+
+        processor = AutoProcessor.from_pretrained(
+            "microsoft/kosmos-2-patch14-224", size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encoding=False should raise a ValueError
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 257, 1024))
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[1.4228, -1.9611, 3.8449], [3.4988, 2.0516, 0.3597], [3.1699, 0.2604, -0.4210]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )
\ No newline at end of file
diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py
index 70e7bb341c7e..ddcedcb93236 100644
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -731,3 +731,41 @@ def test_inference(self):
         expected_logits = torch.tensor([[14.0181, 20.2771, 14.4776]], device=torch_device)
 
         self.assertTrue(torch.allclose(outputs.logits_per_video, expected_logits, atol=1e-3))
+
+    @slow
+    def test_inference_interpolate_pos_encoding(self):
+        # XCLIP models have an `interpolate_pos_encoding` argument in their forward method,
+        # allowing to interpolate the pre-trained position embeddings in order to use
+        # the model on higher resolutions. The DINO model by Facebook AI leverages this
+        # to visualize self-attention on higher resolution images.
+        model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32").to(torch_device)
+
+        processor = XCLIPProcessor.from_pretrained(
+            "microsoft/xclip-base-patch32", size=180, crop_size={"height": 180, "width": 180}
+        )
+
+        video = prepare_video()
+        inputs = processor(text="what's in the video", videos=video, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encoding=False should raise a ValueError
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((8, 26, 768))
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.1806, 0.3649, -0.0850], [0.0210, 0.3411, -0.0637], [0.2307, 0.3106, -0.2027]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )
\ No newline at end of file

From bf6ddf27788776e303d668651aaaffdd3574c869 Mon Sep 17 00:00:00 2001
From: Manuel Sanchez Hernandez
Date: Sun, 11 Aug 2024 23:53:19 +0200
Subject: [PATCH 02/49] adding ruff suggestions

---
 tests/models/bridgetower/test_modeling_bridgetower.py | 2 +-
 tests/models/chinese_clip/test_modeling_chinese_clip.py | 2 +-
 tests/models/clip/test_modeling_clip.py | 2 +-
 tests/models/kosmos2/test_modeling_kosmos2.py | 2 +-
 tests/models/x_clip/test_modeling_x_clip.py | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py
index cae36f151e44..eee422ef9cb7 100644
--- a/tests/models/bridgetower/test_modeling_bridgetower.py
+++ b/tests/models/bridgetower/test_modeling_bridgetower.py
@@ -691,4 +691,4 @@ def test_inference_interpolate_pos_encoding(self):
             [[0.3433, 0.4557, -0.5287], [-0.7111, 0.6576, -1.0850], [-0.2122, 0.2021, -0.0536]]
         ).to(torch_device)
 
-        self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4))
\ No newline at end of file
+        self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py
index 41dc1c16f9a8..18b14ead493c 100644
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -777,4 +777,4 @@ def test_inference_interpolate_pos_encoding(self):
 
         self.assertTrue(
             torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
-        )
\ No newline at end of file
+        )
diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py
index 170a5baa9b4d..f49207e01ab3 100644
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -1215,4 +1215,4 @@ def test_inference_interpolate_pos_encoding(self):
 
         self.assertTrue(
            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
-        )
\ No newline at end of file
+        )
diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py
index 0a2dc3c74d0d..1919ba09aa35 100644
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ 
b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -798,4 +798,4 @@ def test_inference_interpolate_pos_encoding(self): self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) \ No newline at end of file + ) diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index ddcedcb93236..92c531fc9c21 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -768,4 +768,4 @@ def test_inference_interpolate_pos_encoding(self): self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) \ No newline at end of file + ) From c1e5058aff0558d25c09204c893b8ec0aa650b57 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 00:01:00 +0200 Subject: [PATCH 03/49] changes added by python utils/check_copies.py --fix_and_overwrite --- src/transformers/models/clipseg/modeling_clipseg.py | 3 ++- src/transformers/models/git/modeling_git.py | 4 ++++ tests/models/altclip/test_modeling_altclip.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 97fcf3d1f2b3..ed94e90ba994 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -825,6 +825,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -839,7 +840,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index f120896d11ec..32662f137e7a 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -660,6 +660,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." 
+ ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index ef713753aa54..7444cfc9f145 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -632,4 +632,4 @@ def test_inference_interpolate_pos_encoding(self): self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) \ No newline at end of file + ) From 19aaa926dd4c29b1df51a54868fd4e65f06f662e Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 00:27:46 +0200 Subject: [PATCH 04/49] removing pos_encoding added by script --- src/transformers/models/clipseg/modeling_clipseg.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index ed94e90ba994..97fcf3d1f2b3 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -825,7 +825,6 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -840,7 +839,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.embeddings(pixel_values) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( From b2827968867b663c82787446f73d227f6b8e31e3 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 01:48:40 +0200 Subject: [PATCH 05/49] adding interpolation to clipseg --- .../models/clipseg/modeling_clipseg.py | 81 ++++++++++++------- tests/models/clip/test_modeling_clip.py | 2 +- tests/models/clipseg/test_modeling_clipseg.py | 37 +++++++++ 3 files changed, 92 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 97fcf3d1f2b3..df71133115ef 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -163,40 +163,54 @@ def __init__(self, config: CLIPSegVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def interpolate_position_embeddings(self, new_size): - if len(new_size) != 2: - raise ValueError("new_size should consist of 2 values") - - num_patches_one_direction = int(self.num_patches**0.5) - # we interpolate the position embeddings in 2D - a = self.position_embedding.weight[1:].T.view( - 1, self.config.hidden_size, num_patches_one_direction, num_patches_one_direction - ) - b = ( - nn.functional.interpolate(a, new_size, mode="bicubic", align_corners=False) - .squeeze(0) - .view(self.config.hidden_size, new_size[0] * new_size[1]) - .T + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position 
encodings, to be able to use the model on higher + resolution images. + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, ) - result = torch.cat([self.position_embedding.weight[:1], b]) - - return result - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." + ) patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - - if embeddings.shape[1] != self.num_positions: - new_shape = int(math.sqrt(embeddings.shape[1] - 1)) - embeddings = embeddings + self.interpolate_position_embeddings((new_shape, new_shape)) - embeddings = embeddings.to(embeddings.dtype) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) else: embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings @@ -512,6 +526,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -549,6 +565,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. 
+ interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -825,6 +843,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -839,7 +858,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -884,6 +903,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -912,6 +932,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1005,6 +1026,7 @@ def get_image_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1040,6 +1062,7 @@ def get_image_features( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1059,6 +1082,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPSegOutput]: r""" @@ -1096,6 +1120,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1363,6 +1388,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPSegOutput]: r""" @@ -1402,6 +1428,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=True, # we need the intermediate hidden states + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) pooled_output = self.clip.visual_projection(vision_outputs[1]) diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index f49207e01ab3..1ae105264a33 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -1182,7 +1182,7 @@ def test_inference(self): @slow def test_inference_interpolate_pos_encoding(self): - # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # CLIP models have an `interpolate_pos_encoding` argument in their forward method, # allowing to interpolate the pre-trained position embeddings in order to use # 
the model on higher resolutions. The DINO model by Facebook AI leverages this # to visualize self-attention on higher resolution images. diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index a6f286c4c6b7..0553284d7d94 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -813,3 +813,40 @@ def test_inference_image_segmentation(self): expected_pooled_output = torch.tensor([0.5036, -0.2681, -0.2644]).to(torch_device) self.assertTrue(torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)) self.assertTrue(torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)) + + @slow + def test_inference_interpolate_pos_encoding(self): + # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. + model = CLIPSegModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device) + + processor = CLIPSegProcessor.from_pretrained( + "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encodiung false should return value error + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 26, 768)) + + self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]] + ).to(torch_device) + + self.assertTrue( + torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) + ) From 14d600191d0cb0a44b8927ad812eaa8cd83e5716 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 17:50:57 +0200 Subject: [PATCH 06/49] formatting --- .../models/altclip/modeling_altclip.py | 4 +- .../chinese_clip/modeling_chinese_clip.py | 4 +- src/transformers/models/clip/modeling_clip.py | 4 +- src/transformers/models/git/modeling_git.py | 20 ++++++++-- .../models/x_clip/modeling_x_clip.py | 4 +- tests/models/clipseg/test_modeling_clipseg.py | 2 +- tests/models/git/test_modeling_git.py | 37 +++++++++++++++++++ 7 files changed, 64 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 7186afb28b1f..cc48a20656e1 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -100,7 +100,7 @@ output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding(`bool`, defaults to `False`): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. 
return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -139,6 +139,8 @@ output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 9ae56a727ca7..980f7c215717 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -839,7 +839,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -856,7 +856,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index e6131f888582..fcf45a74a7b8 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -735,7 +735,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -775,7 +775,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 32662f137e7a..7d3904375610 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -595,6 +595,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. 
+ interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -958,6 +960,8 @@ def forward( output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -982,6 +986,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: r""" @@ -997,7 +1002,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1046,6 +1051,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: r""" @@ -1075,6 +1081,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1201,6 +1208,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: r""" @@ -1269,13 +1277,17 @@ def forward( if pixel_values is not None: if pixel_values.ndim == 4: # here we assume pixel_values is of shape (batch_size, num_channels, height, width) - visual_features = self.image_encoder(pixel_values).last_hidden_state + visual_features = self.image_encoder( + pixel_values, interpolate_pos_encoding=interpolate_pos_encoding + ).last_hidden_state elif pixel_values.ndim == 5: # here we assume pixel_values is of shape (batch_size, num_frames, num_channels, height, width) visual_features = [] for frame_idx in range(pixel_values.shape[1]): - visual_features_frame = self.image_encoder(pixel_values[:, frame_idx, :, :]).last_hidden_state + visual_features_frame = self.image_encoder( + pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding + ).last_hidden_state visual_features_frame += self.img_temperal_embedding[frame_idx] visual_features.append(visual_features_frame) @@ -1392,6 +1404,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]: r""" @@ -1545,6 +1558,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, 
return_dict=return_dict, ) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 23618c2ed399..d289ce286a26 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -609,7 +609,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -648,7 +648,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 0553284d7d94..df4904fd7bfd 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -849,4 +849,4 @@ def test_inference_interpolate_pos_encoding(self): self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) + ) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index a9c94f54f1fc..1a66fdd2d246 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -558,3 +558,40 @@ def test_batched_generation(self): generated_captions = processor.batch_decode(generated_ids, skip_special_tokens=True) self.assertEqual(generated_captions, ["two cats sleeping on a pink blanket next to remotes."] * 2) + + @slow + def test_inference_interpolate_pos_encoding(self): + # CLIP family models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+        model = GitModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device)
+
+        processor = GitProcessor.from_pretrained(
+            "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encoding=False should raise a ValueError for a non-default image size
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 26, 768))
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )

From 48128b11184138900233741f7d3d57fc0dd3b3a6 Mon Sep 17 00:00:00 2001
From: Manuel Sanchez Hernandez
Date: Mon, 12 Aug 2024 18:02:07 +0200
Subject: [PATCH 07/49] adding further testing to altclip and better documentation to kosmos2

---
 src/transformers/models/kosmos2/modeling_kosmos2.py | 2 +-
 tests/models/altclip/test_modeling_altclip.py | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py
index ca27fa3ace23..51a7d14e1b7e 100644
--- a/src/transformers/models/kosmos2/modeling_kosmos2.py
+++ b/src/transformers/models/kosmos2/modeling_kosmos2.py
@@ -260,7 +260,7 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l
         output_hidden_states (`bool`, *optional*):
             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
             more detail.
-        interpolate_pos_encoding (`bool`, *optional*):
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
         return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
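Taken together, the changes above expose the same user-facing switch across the CLIP family. A minimal usage sketch, assuming the `openai/clip-vit-base-patch32` checkpoint and the 180x180 preprocessing already used by the tests in this series (the printed shape follows from the token arithmetic, it is not taken from a real run):

    import torch
    from PIL import Image
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained(
        "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
    )

    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    inputs = processor(text=["what's in the image"], images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs, interpolate_pos_encoding=True)

    # 180 // 32 = 5 patches per side, 5 * 5 + 1 class token = 26 positions, hence the
    # (1, 26, 768) hidden state checked in the test above; a patch-14 backbone such as
    # AltCLIP or Kosmos-2 gives 180 // 14 = 12, 12 * 12 + 1 = 145 positions instead.
    print(outputs.vision_model_output.last_hidden_state.shape)

With `interpolate_pos_encoding=False`, the same 180x180 input is expected to hit the new "Input image size ... doesn't match model" ValueError raised by the embedding modules.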
diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 7444cfc9f145..c6bf8ce5c905 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -614,12 +614,17 @@ def test_inference_interpolate_pos_encoding(self): image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + # interpolate_pos_encodiung false should return value error + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + # forward pass with torch.no_grad(): outputs = model(**inputs, interpolate_pos_encoding=True) # verify the logits - expected_shape = torch.Size((1, 257, 1024)) + expected_shape = torch.Size((1, 145, 1024)) print("nilesh ") print(outputs.vision_model_output.last_hidden_state.shape) print(outputs.vision_model_output.last_hidden_state[0, :3, :3]) From 8eb1beba84142280fa8accd443681c6dc02ea946 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 18:49:18 +0200 Subject: [PATCH 08/49] skipping test_inputs_embeds_matches_input_ids_with_generate in git model --- tests/models/git/test_modeling_git.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 1a66fdd2d246..985ebe5a0639 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -471,6 +471,10 @@ def test_contrastive_generate_dict_outputs_use_cache(self): def test_greedy_generate_dict_outputs_use_cache(self): pass + @unittest.skip(reason="GitForCausalLM does not support inputs_embeds in generate method") + def test_inputs_embeds_matches_input_ids_with_generate(self): + pass + @require_torch @require_vision From 7ced086acf1b054d3f2c3de1eab6054402e597ae Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Thu, 15 Aug 2024 23:58:26 +0200 Subject: [PATCH 09/49] fixing clipseg comment suggestions --- src/transformers/models/clip/modeling_clip.py | 1 - src/transformers/models/clipseg/modeling_clipseg.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index fcf45a74a7b8..bc42dd67b9d7 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -739,7 +739,6 @@ def _init_weights(self, module): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - """ CLIP_INPUTS_DOCSTRING = r""" diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index df71133115ef..1dfd67611243 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -526,7 +526,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): Whether to interpolate the pre-trained position encodings. 
return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -565,7 +565,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. From cac78866b3e003cddf3e5940782dc1b28d404cd3 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 00:00:15 +0200 Subject: [PATCH 10/49] [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip From a17b554f139a699de51f153e86dea49f625d8f10 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 00:47:25 +0200 Subject: [PATCH 11/49] fixing bridgetower test --- tests/models/bridgetower/test_modeling_bridgetower.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index eee422ef9cb7..9e21d013424e 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -666,9 +666,7 @@ def test_inference_interpolate_pos_encoding(self): model_name = "BridgeTower/bridgetower-base" model = BridgeTowerModel.from_pretrained(model_name).to(torch_device) - image_processor = BridgeTowerProcessor.from_pretrained( - model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180} - ) + image_processor = BridgeTowerProcessor.from_pretrained(model_name, size={"shortest_edge": 180}) image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) @@ -683,12 +681,12 @@ def test_inference_interpolate_pos_encoding(self): outputs = model(**inputs, interpolate_pos_encoding=True) # verify the logits - expected_shape = torch.Size((1, 901, 768)) + expected_shape = torch.Size((1, 122, 768)) self.assertEqual(outputs.image_features.shape, expected_shape) expected_slice = torch.tensor( - [[0.3433, 0.4557, -0.5287], [-0.7111, 0.6576, -1.0850], [-0.2122, 0.2021, -0.0536]] + [[-0.6931, 0.5243, -0.4443], [-2.5986, -0.0715, -0.4051], [-2.5374, -0.0969, -0.4116]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4)) From c4e56fbce89a44f80e2c25ee9d163269e5036d90 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 00:59:44 +0200 Subject: [PATCH 12/49] fixing altclip tensor output POS test --- tests/models/altclip/test_modeling_altclip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index c6bf8ce5c905..3d81c4b62512 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -632,7 +632,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.5297, -0.7713, 0.4655], [0.8688, 0.1690, 0.6678], [1.1742, -0.7551, 0.0396]] + [[-0.3671, -0.5896, 0.3435], [ 0.3136, 0.1141, 
0.7695], [ 1.1259, -0.5578, 0.1346]] ).to(torch_device) self.assertTrue( From e303547b5d40314865f4dc6a236912ef95e50b5c Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 01:06:06 +0200 Subject: [PATCH 13/49] adding ruff formatting --- tests/models/altclip/test_modeling_altclip.py | 2 +- tests/models/bridgetower/test_modeling_bridgetower.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 3d81c4b62512..82b4f70c491a 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -632,7 +632,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.3671, -0.5896, 0.3435], [ 0.3136, 0.1141, 0.7695], [ 1.1259, -0.5578, 0.1346]] + [[-0.3671, -0.5896, 0.3435], [0.3136, 0.1141, 0.7695], [1.1259, -0.5578, 0.1346]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index 9e21d013424e..5ae65820b72b 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -686,7 +686,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.image_features.shape, expected_shape) expected_slice = torch.tensor( - [[-0.6931, 0.5243, -0.4443], [-2.5986, -0.0715, -0.4051], [-2.5374, -0.0969, -0.4116]] + [[-0.6931, 0.5243, -0.4443], [-2.5986, -0.0715, -0.4051], [-2.5374, -0.0969, -0.4116]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4)) From ee8318dc6f02d0769d4e87d8d47c4bb9473ddf9c Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 22:31:51 +0200 Subject: [PATCH 14/49] fixing several tests --- tests/models/chinese_clip/test_modeling_chinese_clip.py | 2 +- tests/models/clip/test_modeling_clip.py | 2 +- tests/models/clipseg/test_modeling_clipseg.py | 4 +++- tests/models/kosmos2/test_modeling_kosmos2.py | 4 ++-- tests/models/x_clip/test_modeling_x_clip.py | 4 +--- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 18b14ead493c..f6ec75a68fbb 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -772,7 +772,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.3374, 0.3212, -0.1293], [-0.2208, -0.6150, 0.7010], [-0.1901, -0.6576, 0.4843]] + [[-0.4035, 0.3008, -0.1223], [-0.1505, -0.2903, 0.0250], [-0.3128, -0.5132, 0.8456]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 1ae105264a33..f2c62b375434 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -1210,7 +1210,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]] + [[-0.1599, 0.0276, -0.3315],[ 0.2613, 
0.1183, -0.5668],[ 0.0244, 0.1978, -0.6078]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index df4904fd7bfd..88b58a1ec54c 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -844,7 +844,9 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]] + [[-0.1599, 0.0276, -0.3315], + [ 0.2613, 0.1183, -0.5668], + [ 0.0244, 0.1978, -0.6078]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 1919ba09aa35..293e1642888a 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -788,12 +788,12 @@ def test_inference_interpolate_pos_encoding(self): outputs = model(**inputs, interpolate_pos_encoding=True) # verify the logits - expected_shape = torch.Size((1, 257, 1024)) + expected_shape = torch.Size((1, 145, 1024)) self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[1.4228, -1.9611, 3.8449], [3.4988, 2.0516, 0.3597], [3.1699, 0.2604, -0.4210]] + [[ 1.0551, -1.1680, 3.2926], [ 2.7077, 0.0720, -0.7721], [ 1.5863, 0.1665, -0.5936]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 92c531fc9c21..1472685418e3 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -761,11 +761,9 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[0.1806, 0.3649, -0.0850], [0.0210, 0.3411, -0.0637], [0.2307, 0.3106, -0.2027]] + [[-0.0312, 0.2034, 0.0556], [ 0.0323, 0.5420, -0.1794], [-0.0952, 0.7817, -0.3186]] ).to(torch_device) - print(outputs.vision_model_output.last_hidden_state[0, :3, :3]) - self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) ) From 20778a399ec402af731fd0e4d28240b516179731 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 22:34:37 +0200 Subject: [PATCH 15/49] formatting with ruff --- tests/models/chinese_clip/test_modeling_chinese_clip.py | 2 +- tests/models/clip/test_modeling_clip.py | 2 +- tests/models/clipseg/test_modeling_clipseg.py | 4 +--- tests/models/kosmos2/test_modeling_kosmos2.py | 2 +- tests/models/x_clip/test_modeling_x_clip.py | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index f6ec75a68fbb..072ce9b41735 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -772,7 +772,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.4035, 0.3008, -0.1223], [-0.1505, -0.2903, 0.0250], [-0.3128, -0.5132, 0.8456]] + [[-0.4035, 0.3008, -0.1223], [-0.1505, -0.2903, 0.0250], [-0.3128, -0.5132, 0.8456]] ).to(torch_device) self.assertTrue( diff 
--git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index f2c62b375434..05e5f2f7dcb9 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -1210,7 +1210,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.1599, 0.0276, -0.3315],[ 0.2613, 0.1183, -0.5668],[ 0.0244, 0.1978, -0.6078]] + [[-0.1599, 0.0276, -0.3315], [0.2613, 0.1183, -0.5668], [0.0244, 0.1978, -0.6078]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 88b58a1ec54c..73ebe3f3fa74 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -844,9 +844,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.1599, 0.0276, -0.3315], - [ 0.2613, 0.1183, -0.5668], - [ 0.0244, 0.1978, -0.6078]] + [[-0.1599, 0.0276, -0.3315], [0.2613, 0.1183, -0.5668], [0.0244, 0.1978, -0.6078]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 293e1642888a..47d10934edd1 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -793,7 +793,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[ 1.0551, -1.1680, 3.2926], [ 2.7077, 0.0720, -0.7721], [ 1.5863, 0.1665, -0.5936]] + [[1.0551, -1.1680, 3.2926], [2.7077, 0.0720, -0.7721], [1.5863, 0.1665, -0.5936]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 1472685418e3..57139d288e4f 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -761,7 +761,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.0312, 0.2034, 0.0556], [ 0.0323, 0.5420, -0.1794], [-0.0952, 0.7817, -0.3186]] + [[-0.0312, 0.2034, 0.0556], [0.0323, 0.5420, -0.1794], [-0.0952, 0.7817, -0.3186]] ).to(torch_device) self.assertTrue( From 024ea6ed54b3eec9dda2ca6362dda9edf2a5f572 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Sun, 11 Aug 2024 19:21:33 +0200 Subject: [PATCH 16/49] adding positional encoder changes and tests --- .../models/altclip/modeling_altclip.py | 58 +++++++++++++- .../bridgetower/modeling_bridgetower.py | 75 ++++++++++++++++--- .../chinese_clip/modeling_chinese_clip.py | 60 ++++++++++++++- src/transformers/models/clip/modeling_clip.py | 64 +++++++++++++++- src/transformers/models/git/modeling_git.py | 43 ++++++++++- .../models/kosmos2/modeling_kosmos2.py | 58 +++++++++++++- .../models/x_clip/modeling_x_clip.py | 57 +++++++++++++- tests/models/altclip/test_modeling_altclip.py | 36 +++++++++ .../bridgetower/test_modeling_bridgetower.py | 36 +++++++++ .../test_modeling_chinese_clip.py | 38 ++++++++++ tests/models/clip/test_modeling_clip.py | 37 +++++++++ tests/models/kosmos2/test_modeling_kosmos2.py | 37 +++++++++ tests/models/x_clip/test_modeling_x_clip.py | 38 ++++++++++ 13 files 
changed, 604 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index f9856ef701f9..7186afb28b1f 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -100,6 +100,8 @@ output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding(`bool`, defaults to `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -1009,15 +1011,56 @@ def __init__(self, config: AltCLIPVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." 
+ ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -1097,6 +1140,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -1111,7 +1155,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1156,6 +1200,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1186,6 +1231,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1546,6 +1592,7 @@ def get_image_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1578,6 +1625,7 @@ def get_image_features( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1598,6 +1646,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, AltCLIPOutput]: r""" @@ -1642,6 +1691,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 91cbda9b72ed..2075ca007439 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -111,6 +111,8 @@ output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, defaults to `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @@ -276,15 +278,56 @@ def __init__(self, config: BridgeTowerVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." 
+ ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -302,8 +345,13 @@ def __init__(self, config): [nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) for _ in range(config.num_hidden_layers)] ) - def forward(self, pixel_values: torch.Tensor, attention_mask): - hidden_states = self.embeddings(pixel_values) + def forward( + self, + pixel_values: torch.Tensor, + attention_mask, + interpolate_pos_encoding: bool = False, + ): + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding) hidden_states = self.ln_pre(hidden_states) # NLD -> LND hidden_states = hidden_states.permute(1, 0, 2) @@ -324,8 +372,12 @@ def forward(self, pixel_values: torch.Tensor, attention_mask): hidden_states = torch.stack(hidden_states_stack, dim=0) return hidden_states - def forward_pre(self, pixel_values: torch.Tensor): - hidden_states = self.embeddings(pixel_values) + def forward_pre( + self, + pixel_values: torch.Tensor, + interpolate_pos_encoding: bool = False, + ): + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.ln_pre(hidden_states) # NLD -> LND hidden_states = hidden_states.permute(1, 0, 2) @@ -1015,8 +1067,8 @@ def __init__(self, config): def dtype(self): return self.visual.embeddings.patch_embedding.weight.dtype - def forward(self, image, image_mask=None): - return self.visual(image.type(self.dtype), image_mask) + def forward(self, image, image_mask=None, interpolate_pos_encoding=False): + return self.visual(image.type(self.dtype), image_mask, interpolate_pos_encoding) class BridgeTowerTextModel(BridgeTowerPreTrainedModel): @@ -1280,6 +1332,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, ) -> Union[Tuple[torch.Tensor], BridgeTowerModelOutput]: r""" output_hidden_states (`bool`, *optional*): @@ -1352,7 +1405,9 @@ def forward( all_hidden_states_text += (text_embeds,) if image_embeds is None: - image_embeds = self.vision_model.visual.forward_pre(pixel_values.type(self.vision_model.dtype)) + image_embeds = self.vision_model.visual.forward_pre( + pixel_values.type(self.vision_model.dtype), interpolate_pos_encoding=interpolate_pos_encoding + ) else: # Permute as BridgeTowerResidualAttention has batch_first=True image_embeds = image_embeds.permute(1, 0, 2) diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 6fbd9459f5ad..9ae56a727ca7 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -188,15 +188,56 @@ def __init__(self, config: ChineseCLIPVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def 
forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." + ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -798,6 +839,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -813,6 +856,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. 
+ interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -1052,6 +1097,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1066,7 +1112,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1299,6 +1345,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1329,6 +1376,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1425,6 +1473,7 @@ def get_image_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1461,6 +1510,7 @@ def get_image_features( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1481,6 +1531,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, ChineseCLIPOutput]: r""" @@ -1516,6 +1567,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index d5f12c9fe413..029a02bd8ad5 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -14,6 +14,7 @@ # limitations under the License. """PyTorch CLIP model.""" +import math from dataclasses import dataclass from typing import Any, Optional, Tuple, Union @@ -196,15 +197,56 @@ def __init__(self, config: CLIPVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." + ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -704,8 +746,11 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ CLIP_INPUTS_DOCSTRING = r""" @@ -741,6 +786,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @@ -1023,6 +1070,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -1037,7 +1085,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1087,6 +1135,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1118,6 +1167,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, ) @@ -1214,6 +1264,7 @@ def get_image_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1249,6 +1300,7 @@ def get_image_features( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1268,6 +1320,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPOutput]: r""" @@ -1305,6 +1358,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1466,6 +1520,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPVisionModelOutput]: r""" @@ -1495,6 +1550,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 581f2b3947b4..f120896d11ec 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -624,15 +624,52 @@ def __init__(self, config: GitVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 69641790b2db..ca27fa3ace23 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -120,6 +120,8 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -258,6 +260,8 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @@ -400,15 +404,56 @@ def __init__(self, config: Kosmos2VisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." 
+ ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -700,6 +745,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -711,7 +757,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1442,6 +1488,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1452,6 +1499,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1768,6 +1816,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, Kosmos2ModelOutput]: r""" @@ -1819,6 +1868,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`. diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 791e501d1737..23618c2ed399 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -14,6 +14,7 @@ # limitations under the License. """PyTorch X-CLIP model.""" +import math from copy import copy from dataclasses import dataclass from typing import Any, Optional, Tuple, Union @@ -121,15 +122,56 @@ def __init__(self, config: XCLIPVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." + ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings @@ -567,6 +609,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -604,6 +648,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" @@ -954,6 +1000,7 @@ def forward( pixel_values: torch.FloatTensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -966,7 +1013,7 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layernorm(hidden_states) encoder_outputs = self.encoder( @@ -1455,6 +1502,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, XCLIPOutput]: r""" @@ -1555,6 +1603,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 83b6d60595d3..ef713753aa54 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -597,3 +597,39 @@ def test_inference(self): expected_probs = torch.tensor([[9.9942e-01, 5.7805e-04]], device=torch_device) self.assertTrue(torch.allclose(probs, expected_probs, atol=5e-3)) + + @slow + def test_inference_interpolate_pos_encoding(self): + # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+ model_name = "BAAI/AltCLIP" + model = AltCLIPModel.from_pretrained(model_name).to(torch_device) + + image_processor = AltCLIPProcessor.from_pretrained( + model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 257, 1024)) + print("nilesh ") + print(outputs.vision_model_output.last_hidden_state.shape) + print(outputs.vision_model_output.last_hidden_state[0, :3, :3]) + + self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.5297, -0.7713, 0.4655], [0.8688, 0.1690, 0.6678], [1.1742, -0.7551, 0.0396]] + ).to(torch_device) + + self.assertTrue( + torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) + ) \ No newline at end of file diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index 44e6a404f623..cae36f151e44 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -656,3 +656,39 @@ def test_training(self): for name, param in model.named_parameters(): if self._is_layer_used(model_class, name): self.assertIsNotNone(param.grad, f"Gradients should not be None - got {param.grad} for {name}") + + @slow + def test_inference_interpolate_pos_encoding(self): + # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+ model_name = "BridgeTower/bridgetower-base" + model = BridgeTowerModel.from_pretrained(model_name).to(torch_device) + + image_processor = BridgeTowerProcessor.from_pretrained( + model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encodiung false should return value error + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 901, 768)) + + self.assertEqual(outputs.image_features.shape, expected_shape) + + expected_slice = torch.tensor( + [[0.3433, 0.4557, -0.5287], [-0.7111, 0.6576, -1.0850], [-0.2122, 0.2021, -0.0536]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 7046f28b5f94..41dc1c16f9a8 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -740,3 +740,41 @@ def test_inference(self): expected_probs = torch.tensor([[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]], device=torch_device) self.assertTrue(torch.allclose(probs, expected_probs, atol=5e-3)) + + @slow + def test_inference_interpolate_pos_encoding(self): + # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+ model_name = "OFA-Sys/chinese-clip-vit-base-patch16" + model = ChineseCLIPModel.from_pretrained(model_name).to(torch_device) + + image_processor = ChineseCLIPProcessor.from_pretrained( + model_name, size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encodiung false should return value error + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 122, 768)) + + self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.3374, 0.3212, -0.1293], [-0.2208, -0.6150, 0.7010], [-0.1901, -0.6576, 0.4843]] + ).to(torch_device) + + self.assertTrue( + torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) + ) \ No newline at end of file diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 3b6994428088..170a5baa9b4d 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -1179,3 +1179,40 @@ def test_inference(self): expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) + + @slow + def test_inference_interpolate_pos_encoding(self): + # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device)
+
+        processor = CLIPProcessor.from_pretrained(
+            "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encoding=False should raise a ValueError
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 26, 768))
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )
\ No newline at end of file
diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py
index 6f34689004ef..0a2dc3c74d0d 100644
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -762,3 +762,40 @@ def test_snowman_image_captioning_batch(self):
         self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_0)
         self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_0)
         self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_0)
+
+    @slow
+    def test_inference_interpolate_pos_encoding(self):
+        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
+        # allowing to interpolate the pre-trained position embeddings in order to use
+        # the model on higher resolutions. The DINO model by Facebook AI leverages this
+        # to visualize self-attention on higher resolution images.
+        model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
+
+        processor = AutoProcessor.from_pretrained(
+            "microsoft/kosmos-2-patch14-224", size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encoding=False should raise a ValueError
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 257, 1024))
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[1.4228, -1.9611, 3.8449], [3.4988, 2.0516, 0.3597], [3.1699, 0.2604, -0.4210]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )
\ No newline at end of file
diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py
index 70e7bb341c7e..ddcedcb93236 100644
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -731,3 +731,41 @@ def test_inference(self):
         expected_logits = torch.tensor([[14.0181, 20.2771, 14.4776]], device=torch_device)

         self.assertTrue(torch.allclose(outputs.logits_per_video, expected_logits, atol=1e-3))
+
+    @slow
+    def test_inference_interpolate_pos_encoding(self):
+        # XCLIP models have an `interpolate_pos_encoding` argument in their forward method,
+        # allowing to interpolate the pre-trained position embeddings in order to use
+        # the model on higher resolutions. The DINO model by Facebook AI leverages this
+        # to visualize self-attention on higher resolution images.
+        model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32").to(torch_device)
+
+        processor = XCLIPProcessor.from_pretrained(
+            "microsoft/xclip-base-patch32", size=180, crop_size={"height": 180, "width": 180}
+        )
+
+        video = prepare_video()
+        inputs = processor(text="what's in the video", videos=video, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encoding=False should raise a ValueError
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((8, 26, 768))
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.1806, 0.3649, -0.0850], [0.0210, 0.3411, -0.0637], [0.2307, 0.3106, -0.2027]]
+        ).to(torch_device)
+
+        self.assertTrue(
+            torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
+        )
\ No newline at end of file

From 9c645e31451e022cd7622a2bfe79b621950d33a3 Mon Sep 17 00:00:00 2001
From: Manuel Sanchez Hernandez
Date: Sun, 11 Aug 2024 23:53:19 +0200
Subject: [PATCH 17/49] adding ruff suggestions
---
 tests/models/bridgetower/test_modeling_bridgetower.py | 2 +-
 tests/models/chinese_clip/test_modeling_chinese_clip.py | 2 +-
 tests/models/clip/test_modeling_clip.py | 2 +-
 tests/models/kosmos2/test_modeling_kosmos2.py | 2 +-
 tests/models/x_clip/test_modeling_x_clip.py | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py
index cae36f151e44..eee422ef9cb7 100644
--- a/tests/models/bridgetower/test_modeling_bridgetower.py
+++ b/tests/models/bridgetower/test_modeling_bridgetower.py
@@ -691,4 +691,4 @@ def test_inference_interpolate_pos_encoding(self):
             [[0.3433, 0.4557, -0.5287], [-0.7111, 0.6576, -1.0850], [-0.2122, 0.2021, -0.0536]]
         ).to(torch_device)

-        self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4))
\ No newline at end of file
+        self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4))
diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py
index 41dc1c16f9a8..18b14ead493c 100644
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -777,4 +777,4 @@ def test_inference_interpolate_pos_encoding(self):

         self.assertTrue(
             torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
-        )
\ No newline at end of file
+        )
diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py
index 170a5baa9b4d..f49207e01ab3 100644
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -1215,4 +1215,4 @@ def test_inference_interpolate_pos_encoding(self):

         self.assertTrue(
             torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
-        )
\ No newline at end of file
+        )
diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py
index 0a2dc3c74d0d..1919ba09aa35 100644
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ 
b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -798,4 +798,4 @@ def test_inference_interpolate_pos_encoding(self): self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) \ No newline at end of file + ) diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index ddcedcb93236..92c531fc9c21 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -768,4 +768,4 @@ def test_inference_interpolate_pos_encoding(self): self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) \ No newline at end of file + ) From 19ad494141e7795a38491fc126588895318d412f Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 00:01:00 +0200 Subject: [PATCH 18/49] changes added by python utils/check_copies.py --fix_and_overwrite --- src/transformers/models/clipseg/modeling_clipseg.py | 3 ++- src/transformers/models/git/modeling_git.py | 4 ++++ tests/models/altclip/test_modeling_altclip.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 97fcf3d1f2b3..ed94e90ba994 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -825,6 +825,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -839,7 +840,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index f120896d11ec..32662f137e7a 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -660,6 +660,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." 
+ ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index ef713753aa54..7444cfc9f145 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -632,4 +632,4 @@ def test_inference_interpolate_pos_encoding(self): self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) \ No newline at end of file + ) From b383517debd46dc166b8ae159dd443f47adf0e45 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 00:27:46 +0200 Subject: [PATCH 19/49] removing pos_encoding added by script --- src/transformers/models/clipseg/modeling_clipseg.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index ed94e90ba994..97fcf3d1f2b3 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -825,7 +825,6 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -840,7 +839,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.embeddings(pixel_values) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( From 578411c1c4cbb76b60220aab4b4dd14c8c223e23 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 01:48:40 +0200 Subject: [PATCH 20/49] adding interpolation to clipseg --- .../models/clipseg/modeling_clipseg.py | 81 ++++++++++++------- tests/models/clip/test_modeling_clip.py | 2 +- tests/models/clipseg/test_modeling_clipseg.py | 37 +++++++++ 3 files changed, 92 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 97fcf3d1f2b3..df71133115ef 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -163,40 +163,54 @@ def __init__(self, config: CLIPSegVisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - def interpolate_position_embeddings(self, new_size): - if len(new_size) != 2: - raise ValueError("new_size should consist of 2 values") - - num_patches_one_direction = int(self.num_patches**0.5) - # we interpolate the position embeddings in 2D - a = self.position_embedding.weight[1:].T.view( - 1, self.config.hidden_size, num_patches_one_direction, num_patches_one_direction - ) - b = ( - nn.functional.interpolate(a, new_size, mode="bicubic", align_corners=False) - .squeeze(0) - .view(self.config.hidden_size, new_size[0] * new_size[1]) - .T + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position 
encodings, to be able to use the model on higher + resolution images. + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = embeddings.shape[-1] + height = height // self.config.patch_size + width = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + height, width = height + 0.1, width + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, ) - result = torch.cat([self.position_embedding.weight[:1], b]) - - return result - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." + ) patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - - if embeddings.shape[1] != self.num_positions: - new_shape = int(math.sqrt(embeddings.shape[1] - 1)) - embeddings = embeddings + self.interpolate_position_embeddings((new_shape, new_shape)) - embeddings = embeddings.to(embeddings.dtype) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) else: embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings @@ -512,6 +526,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -549,6 +565,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. 
+ interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -825,6 +843,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -839,7 +858,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -884,6 +903,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -912,6 +932,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1005,6 +1026,7 @@ def get_image_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1040,6 +1062,7 @@ def get_image_features( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1059,6 +1082,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPSegOutput]: r""" @@ -1096,6 +1120,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1363,6 +1388,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPSegOutput]: r""" @@ -1402,6 +1428,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=True, # we need the intermediate hidden states + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) pooled_output = self.clip.visual_projection(vision_outputs[1]) diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index f49207e01ab3..1ae105264a33 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -1182,7 +1182,7 @@ def test_inference(self): @slow def test_inference_interpolate_pos_encoding(self): - # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # CLIP models have an `interpolate_pos_encoding` argument in their forward method, # allowing to interpolate the pre-trained position embeddings in order to use # 
the model on higher resolutions. The DINO model by Facebook AI leverages this # to visualize self-attention on higher resolution images. diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index a6f286c4c6b7..0553284d7d94 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -813,3 +813,40 @@ def test_inference_image_segmentation(self): expected_pooled_output = torch.tensor([0.5036, -0.2681, -0.2644]).to(torch_device) self.assertTrue(torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)) self.assertTrue(torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)) + + @slow + def test_inference_interpolate_pos_encoding(self): + # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. + model = CLIPSegModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device) + + processor = CLIPSegProcessor.from_pretrained( + "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encodiung false should return value error + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 26, 768)) + + self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]] + ).to(torch_device) + + self.assertTrue( + torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) + ) From 5517dab12858f015cba4655b50b4338465a6d959 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 17:50:57 +0200 Subject: [PATCH 21/49] formatting --- .../models/altclip/modeling_altclip.py | 4 +- .../chinese_clip/modeling_chinese_clip.py | 4 +- src/transformers/models/clip/modeling_clip.py | 4 +- src/transformers/models/git/modeling_git.py | 20 ++++++++-- .../models/x_clip/modeling_x_clip.py | 4 +- tests/models/clipseg/test_modeling_clipseg.py | 2 +- tests/models/git/test_modeling_git.py | 37 +++++++++++++++++++ 7 files changed, 64 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 7186afb28b1f..cc48a20656e1 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -100,7 +100,7 @@ output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding(`bool`, defaults to `False`): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. 
return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -139,6 +139,8 @@ output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 9ae56a727ca7..980f7c215717 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -839,7 +839,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -856,7 +856,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 029a02bd8ad5..5dfb3ee0b398 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -746,7 +746,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -786,7 +786,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 32662f137e7a..7d3904375610 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -595,6 +595,8 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. 
+ interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -958,6 +960,8 @@ def forward( output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -982,6 +986,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: r""" @@ -997,7 +1002,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs = self.encoder( @@ -1046,6 +1051,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: r""" @@ -1075,6 +1081,7 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) @@ -1201,6 +1208,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: r""" @@ -1269,13 +1277,17 @@ def forward( if pixel_values is not None: if pixel_values.ndim == 4: # here we assume pixel_values is of shape (batch_size, num_channels, height, width) - visual_features = self.image_encoder(pixel_values).last_hidden_state + visual_features = self.image_encoder( + pixel_values, interpolate_pos_encoding=interpolate_pos_encoding + ).last_hidden_state elif pixel_values.ndim == 5: # here we assume pixel_values is of shape (batch_size, num_frames, num_channels, height, width) visual_features = [] for frame_idx in range(pixel_values.shape[1]): - visual_features_frame = self.image_encoder(pixel_values[:, frame_idx, :, :]).last_hidden_state + visual_features_frame = self.image_encoder( + pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding + ).last_hidden_state visual_features_frame += self.img_temperal_embedding[frame_idx] visual_features.append(visual_features_frame) @@ -1392,6 +1404,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]: r""" @@ -1545,6 +1558,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, 
return_dict=return_dict, ) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 23618c2ed399..d289ce286a26 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -609,7 +609,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -648,7 +648,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 0553284d7d94..df4904fd7bfd 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -849,4 +849,4 @@ def test_inference_interpolate_pos_encoding(self): self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) + ) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index a9c94f54f1fc..1a66fdd2d246 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -558,3 +558,40 @@ def test_batched_generation(self): generated_captions = processor.batch_decode(generated_ids, skip_special_tokens=True) self.assertEqual(generated_captions, ["two cats sleeping on a pink blanket next to remotes."] * 2) + + @slow + def test_inference_interpolate_pos_encoding(self): + # CLIP family models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+ model = GitModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device) + + processor = GitProcessor.from_pretrained( + "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encoding=False should raise a ValueError + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 26, 768)) + + self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]] + ).to(torch_device) + + self.assertTrue( + torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) + ) From 34d8999b9f0107d29eec3879bb705a4bd4a9ed52 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 18:02:07 +0200 Subject: [PATCH 22/49] adding further testing to altclip and better documentation to kosmos2 --- src/transformers/models/kosmos2/modeling_kosmos2.py | 2 +- tests/models/altclip/test_modeling_altclip.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index ca27fa3ace23..51a7d14e1b7e 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -260,7 +260,7 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
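The interpolation tests added across these patches all exercise the same call pattern. A minimal sketch of that pattern is shown below, assuming the stock "openai/clip-vit-base-patch32" checkpoint and the 180x180 resolution used in the tests; the printed shape matches what the CLIP test asserts, while the exact hidden-state values are checkpoint-dependent and are not reproduced here.

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Illustrative checkpoint and resolution; any model in this PR that gained the
# `interpolate_pos_encoding` forward argument is used the same way.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained(
    "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
)

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = processor(text="what's in the image", images=image, return_tensors="pt")

with torch.no_grad():
    # With interpolate_pos_encoding=False (the default), a 180x180 input raises a
    # ValueError because the checkpoint was pre-trained at 224x224.
    outputs = model(**inputs, interpolate_pos_encoding=True)

# 180 // 32 = 5 patches per side, plus the class token: 5 * 5 + 1 = 26 positions.
print(outputs.vision_model_output.last_hidden_state.shape)  # torch.Size([1, 26, 768])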
diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 7444cfc9f145..c6bf8ce5c905 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -614,12 +614,17 @@ def test_inference_interpolate_pos_encoding(self): image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + # interpolate_pos_encodiung false should return value error + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + # forward pass with torch.no_grad(): outputs = model(**inputs, interpolate_pos_encoding=True) # verify the logits - expected_shape = torch.Size((1, 257, 1024)) + expected_shape = torch.Size((1, 145, 1024)) print("nilesh ") print(outputs.vision_model_output.last_hidden_state.shape) print(outputs.vision_model_output.last_hidden_state[0, :3, :3]) From 48be16e17d19fad6eced6b6f2e1de6677cd81c50 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Mon, 12 Aug 2024 18:49:18 +0200 Subject: [PATCH 23/49] skipping test_inputs_embeds_matches_input_ids_with_generate in git model --- tests/models/git/test_modeling_git.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 1a66fdd2d246..985ebe5a0639 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -471,6 +471,10 @@ def test_contrastive_generate_dict_outputs_use_cache(self): def test_greedy_generate_dict_outputs_use_cache(self): pass + @unittest.skip(reason="GitForCausalLM does not support inputs_embeds in generate method") + def test_inputs_embeds_matches_input_ids_with_generate(self): + pass + @require_torch @require_vision From 633310a1ba93c104eb33de6410c2526a82ece6bd Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Thu, 15 Aug 2024 23:58:26 +0200 Subject: [PATCH 24/49] fixing clipseg comment suggestions --- src/transformers/models/clip/modeling_clip.py | 1 - src/transformers/models/clipseg/modeling_clipseg.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 5dfb3ee0b398..893a89f92e91 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -750,7 +750,6 @@ def _init_weights(self, module): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - """ CLIP_INPUTS_DOCSTRING = r""" diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index df71133115ef..1dfd67611243 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -526,7 +526,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): Whether to interpolate the pre-trained position encodings. 
return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -565,7 +565,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. From 9c3ccddcdcce623dc16e3acdc26a69a70db6e91e Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 00:47:25 +0200 Subject: [PATCH 25/49] fixing bridgetower test --- tests/models/bridgetower/test_modeling_bridgetower.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index eee422ef9cb7..9e21d013424e 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -666,9 +666,7 @@ def test_inference_interpolate_pos_encoding(self): model_name = "BridgeTower/bridgetower-base" model = BridgeTowerModel.from_pretrained(model_name).to(torch_device) - image_processor = BridgeTowerProcessor.from_pretrained( - model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180} - ) + image_processor = BridgeTowerProcessor.from_pretrained(model_name, size={"shortest_edge": 180}) image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) @@ -683,12 +681,12 @@ def test_inference_interpolate_pos_encoding(self): outputs = model(**inputs, interpolate_pos_encoding=True) # verify the logits - expected_shape = torch.Size((1, 901, 768)) + expected_shape = torch.Size((1, 122, 768)) self.assertEqual(outputs.image_features.shape, expected_shape) expected_slice = torch.tensor( - [[0.3433, 0.4557, -0.5287], [-0.7111, 0.6576, -1.0850], [-0.2122, 0.2021, -0.0536]] + [[-0.6931, 0.5243, -0.4443], [-2.5986, -0.0715, -0.4051], [-2.5374, -0.0969, -0.4116]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4)) From 3a62e941176043e4ac800b897c35bb9a514fd396 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 00:59:44 +0200 Subject: [PATCH 26/49] fixing altclip tensor output POS test --- tests/models/altclip/test_modeling_altclip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index c6bf8ce5c905..3d81c4b62512 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -632,7 +632,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.5297, -0.7713, 0.4655], [0.8688, 0.1690, 0.6678], [1.1742, -0.7551, 0.0396]] + [[-0.3671, -0.5896, 0.3435], [ 0.3136, 0.1141, 0.7695], [ 1.1259, -0.5578, 0.1346]] ).to(torch_device) self.assertTrue( From 153938fd0a15744ab95d1aac2e3313ea504067c2 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 01:06:06 +0200 Subject: [PATCH 27/49] adding ruff 
formatting --- tests/models/altclip/test_modeling_altclip.py | 2 +- tests/models/bridgetower/test_modeling_bridgetower.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 3d81c4b62512..82b4f70c491a 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -632,7 +632,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.3671, -0.5896, 0.3435], [ 0.3136, 0.1141, 0.7695], [ 1.1259, -0.5578, 0.1346]] + [[-0.3671, -0.5896, 0.3435], [0.3136, 0.1141, 0.7695], [1.1259, -0.5578, 0.1346]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index 9e21d013424e..5ae65820b72b 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -686,7 +686,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.image_features.shape, expected_shape) expected_slice = torch.tensor( - [[-0.6931, 0.5243, -0.4443], [-2.5986, -0.0715, -0.4051], [-2.5374, -0.0969, -0.4116]] + [[-0.6931, 0.5243, -0.4443], [-2.5986, -0.0715, -0.4051], [-2.5374, -0.0969, -0.4116]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4)) From ca9682de5ed5c117b18af8d31005dcd26c628042 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 22:31:51 +0200 Subject: [PATCH 28/49] fixing several tests --- tests/models/chinese_clip/test_modeling_chinese_clip.py | 2 +- tests/models/clip/test_modeling_clip.py | 2 +- tests/models/clipseg/test_modeling_clipseg.py | 4 +++- tests/models/kosmos2/test_modeling_kosmos2.py | 4 ++-- tests/models/x_clip/test_modeling_x_clip.py | 4 +--- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 18b14ead493c..f6ec75a68fbb 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -772,7 +772,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.3374, 0.3212, -0.1293], [-0.2208, -0.6150, 0.7010], [-0.1901, -0.6576, 0.4843]] + [[-0.4035, 0.3008, -0.1223], [-0.1505, -0.2903, 0.0250], [-0.3128, -0.5132, 0.8456]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 1ae105264a33..f2c62b375434 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -1210,7 +1210,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]] + [[-0.1599, 0.0276, -0.3315],[ 0.2613, 0.1183, -0.5668],[ 0.0244, 0.1978, -0.6078]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index df4904fd7bfd..88b58a1ec54c 100644 --- 
a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -844,7 +844,9 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]] + [[-0.1599, 0.0276, -0.3315], + [ 0.2613, 0.1183, -0.5668], + [ 0.0244, 0.1978, -0.6078]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 1919ba09aa35..293e1642888a 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -788,12 +788,12 @@ def test_inference_interpolate_pos_encoding(self): outputs = model(**inputs, interpolate_pos_encoding=True) # verify the logits - expected_shape = torch.Size((1, 257, 1024)) + expected_shape = torch.Size((1, 145, 1024)) self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[1.4228, -1.9611, 3.8449], [3.4988, 2.0516, 0.3597], [3.1699, 0.2604, -0.4210]] + [[ 1.0551, -1.1680, 3.2926], [ 2.7077, 0.0720, -0.7721], [ 1.5863, 0.1665, -0.5936]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 92c531fc9c21..1472685418e3 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -761,11 +761,9 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[0.1806, 0.3649, -0.0850], [0.0210, 0.3411, -0.0637], [0.2307, 0.3106, -0.2027]] + [[-0.0312, 0.2034, 0.0556], [ 0.0323, 0.5420, -0.1794], [-0.0952, 0.7817, -0.3186]] ).to(torch_device) - print(outputs.vision_model_output.last_hidden_state[0, :3, :3]) - self.assertTrue( torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) ) From 85674081f839ecdd29e91ffd0b33258e498dff86 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Fri, 16 Aug 2024 22:34:37 +0200 Subject: [PATCH 29/49] formatting with ruff --- tests/models/chinese_clip/test_modeling_chinese_clip.py | 2 +- tests/models/clip/test_modeling_clip.py | 2 +- tests/models/clipseg/test_modeling_clipseg.py | 4 +--- tests/models/kosmos2/test_modeling_kosmos2.py | 2 +- tests/models/x_clip/test_modeling_x_clip.py | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index f6ec75a68fbb..072ce9b41735 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -772,7 +772,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.4035, 0.3008, -0.1223], [-0.1505, -0.2903, 0.0250], [-0.3128, -0.5132, 0.8456]] + [[-0.4035, 0.3008, -0.1223], [-0.1505, -0.2903, 0.0250], [-0.3128, -0.5132, 0.8456]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index f2c62b375434..05e5f2f7dcb9 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -1210,7 
+1210,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.1599, 0.0276, -0.3315],[ 0.2613, 0.1183, -0.5668],[ 0.0244, 0.1978, -0.6078]] + [[-0.1599, 0.0276, -0.3315], [0.2613, 0.1183, -0.5668], [0.0244, 0.1978, -0.6078]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 88b58a1ec54c..73ebe3f3fa74 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -844,9 +844,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.1599, 0.0276, -0.3315], - [ 0.2613, 0.1183, -0.5668], - [ 0.0244, 0.1978, -0.6078]] + [[-0.1599, 0.0276, -0.3315], [0.2613, 0.1183, -0.5668], [0.0244, 0.1978, -0.6078]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 293e1642888a..47d10934edd1 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -793,7 +793,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[ 1.0551, -1.1680, 3.2926], [ 2.7077, 0.0720, -0.7721], [ 1.5863, 0.1665, -0.5936]] + [[1.0551, -1.1680, 3.2926], [2.7077, 0.0720, -0.7721], [1.5863, 0.1665, -0.5936]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 1472685418e3..57139d288e4f 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -761,7 +761,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.0312, 0.2034, 0.0556], [ 0.0323, 0.5420, -0.1794], [-0.0952, 0.7817, -0.3186]] + [[-0.0312, 0.2034, 0.0556], [0.0323, 0.5420, -0.1794], [-0.0952, 0.7817, -0.3186]] ).to(torch_device) self.assertTrue( From 9941dbd49a8c969c17cf94b759bd27e2d1025d39 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Wed, 21 Aug 2024 20:28:33 +0200 Subject: [PATCH 30/49] adding right pretrained model --- tests/models/git/test_modeling_git.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 985ebe5a0639..a2757258eb05 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -569,10 +569,10 @@ def test_inference_interpolate_pos_encoding(self): # allowing to interpolate the pre-trained position embeddings in order to use # the model on higher resolutions. The DINO model by Facebook AI leverages this # to visualize self-attention on higher resolution images. 
- model = GitModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device) + model = GitModel.from_pretrained("microsoft/git-base").to(torch_device) processor = GitProcessor.from_pretrained( - "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + "microsoft/git-base", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} ) image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") From 9d0557202acc1df87c17fca5c9492fd3da170f3d Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Tue, 3 Sep 2024 15:29:32 +0100 Subject: [PATCH 31/49] [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip From 16363f628a9857f40de2a26725d708a288a4e96b Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Thu, 5 Sep 2024 22:30:24 +0200 Subject: [PATCH 32/49] fixing test_inference_image_segmentation --- tests/models/clipseg/test_modeling_clipseg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 73ebe3f3fa74..d724c6cef114 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -796,7 +796,7 @@ def test_inference_image_segmentation(self): # forward pass with torch.no_grad(): - outputs = model(**inputs) + outputs = model(**inputs, interpolate_pos_encoding=True) # verify the predicted masks self.assertEqual( @@ -804,13 +804,13 @@ def test_inference_image_segmentation(self): torch.Size((3, 352, 352)), ) expected_masks_slice = torch.tensor( - [[-7.4613, -7.4785, -7.3628], [-7.3268, -7.0899, -7.1333], [-6.9838, -6.7900, -6.8913]] + [[-7.4729, -7.4890, -7.3732], [-7.3379, -7.1001, -7.1432], [-6.9942, -6.8002, -6.9010]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3)) # verify conditional and pooled output expected_conditional = torch.tensor([0.5601, -0.0314, 0.1980]).to(torch_device) - expected_pooled_output = torch.tensor([0.5036, -0.2681, -0.2644]).to(torch_device) + expected_pooled_output = torch.tensor([0.5050, -0.2674, -0.2627]).to(torch_device) self.assertTrue(torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)) self.assertTrue(torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)) From e3e227274c6378c6bfac679d553c711c8b18f825 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Thu, 5 Sep 2024 22:30:46 +0200 Subject: [PATCH 33/49] [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip From e35729a7616fdc5fb240346e99f198ae8ff53863 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Thu, 5 Sep 2024 23:17:49 +0200 Subject: [PATCH 34/49] fixing test_inference_interpolate_pos_encoding for the git model as there is no vision_model_output --- tests/models/git/test_modeling_git.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 167717f7c888..45611c46f72c 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -589,14 +589,14 @@ def test_inference_interpolate_pos_encoding(self): outputs = model(**inputs, interpolate_pos_encoding=True) # verify the logits - expected_shape = torch.Size((1, 26, 768)) + expected_shape = torch.Size((1, 130, 768)) - 
self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]] + [[-1.0274, 2.6038, 0.8594], [ 1.6899, 1.3264, -0.5352], [-1.4955, -0.1172, 0.0266]] ).to(torch_device) self.assertTrue( - torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) + torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) ) From 58a02f107600ff38362430abbeca6731264c93f9 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Thu, 5 Sep 2024 23:18:30 +0200 Subject: [PATCH 35/49] [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip From fcbf2d2328135dd90de4af48209d671159e612b4 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Thu, 5 Sep 2024 23:24:55 +0200 Subject: [PATCH 36/49] adding ruff formatting --- tests/models/git/test_modeling_git.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 45611c46f72c..ef732082d036 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -594,9 +594,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-1.0274, 2.6038, 0.8594], [ 1.6899, 1.3264, -0.5352], [-1.4955, -0.1172, 0.0266]] + [[-1.0274, 2.6038, 0.8594], [1.6899, 1.3264, -0.5352], [-1.4955, -0.1172, 0.0266]] ).to(torch_device) - self.assertTrue( - torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) From d44e07030f7ef63ab79e134f8f6650ec1a546147 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Thu, 5 Sep 2024 23:25:07 +0200 Subject: [PATCH 37/49] [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip From ea54d256ad6f22a86ab79694b1ff83cf563bd128 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Sun, 15 Sep 2024 16:09:16 +0200 Subject: [PATCH 38/49] adding new interpolate_pos_encoding function --- .../models/altclip/modeling_altclip.py | 48 +++++++++-------- .../bridgetower/modeling_bridgetower.py | 54 +++++++++++-------- .../chinese_clip/modeling_chinese_clip.py | 47 +++++++++------- src/transformers/models/clip/modeling_clip.py | 48 +++++++++-------- .../models/clipseg/modeling_clipseg.py | 48 ++++++++++------- src/transformers/models/git/modeling_git.py | 54 +++++++++++-------- .../models/kosmos2/modeling_kosmos2.py | 47 +++++++++------- .../models/x_clip/modeling_x_clip.py | 48 +++++++++-------- 8 files changed, 229 insertions(+), 165 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index b6b7d6683dd5..984850250863 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -32,7 +32,7 @@ ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import ModelOutput, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ModelOutput, add_start_docstrings_to_model_forward, logging, 
replace_return_docstrings, torch_int from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig @@ -1015,37 +1015,43 @@ def __init__(self, config: AltCLIPVisionConfig): def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 - num_positions = position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return position_embeddings - class_pos_embed = position_embeddings[:, 0] - patch_pos_embed = position_embeddings[:, 1:] + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] - height = height // self.config.patch_size - width = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - height, width = height + 0.1, width + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( patch_pos_embed, - scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + size=(new_height, new_width), mode="bicubic", align_corners=False, ) - if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 80a249f429c2..05100382d9a3 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ 
b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -34,7 +34,13 @@ ) from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) from .configuration_bridgetower import BridgeTowerConfig, BridgeTowerTextConfig, BridgeTowerVisionConfig @@ -280,37 +286,43 @@ def __init__(self, config: BridgeTowerVisionConfig): def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 - num_positions = position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return position_embeddings - class_pos_embed = position_embeddings[:, 0] - patch_pos_embed = position_embeddings[:, 1:] + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] - height = height // self.config.patch_size - width = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - height, width = height + 0.1, width + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( patch_pos_embed, - scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + size=(new_height, new_width), mode="bicubic", align_corners=False, ) - if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + return 
torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 980f7c215717..22e45108a0e7 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -38,6 +38,7 @@ add_start_docstrings_to_model_forward, logging, replace_return_docstrings, + torch_int, ) from .configuration_chinese_clip import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig @@ -190,37 +191,43 @@ def __init__(self, config: ChineseCLIPVisionConfig): def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 - num_positions = position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return position_embeddings - class_pos_embed = position_embeddings[:, 0] - patch_pos_embed = position_embeddings[:, 1:] + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] - height = height // self.config.patch_size - width = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - height, width = height + 0.1, width + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( patch_pos_embed, - scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + size=(new_height, new_width), mode="bicubic", align_corners=False, ) - if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 
2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 64bec3c7da1b..6f32e81b7133 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -14,7 +14,6 @@ # limitations under the License. """PyTorch CLIP model.""" -import math from dataclasses import dataclass from typing import Any, Optional, Tuple, Union @@ -37,6 +36,7 @@ is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings, + torch_int, ) from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig @@ -199,37 +199,43 @@ def __init__(self, config: CLIPVisionConfig): def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 - num_positions = position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return position_embeddings - class_pos_embed = position_embeddings[:, 0] - patch_pos_embed = position_embeddings[:, 1:] + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] - height = height // self.config.patch_size - width = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - height, width = height + 0.1, width + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( patch_pos_embed, - scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + size=(new_height, new_width), mode="bicubic", align_corners=False, ) - if int(height) != patch_pos_embed.shape[-2] or 
int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 560234779752..356db0ddffd3 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -33,6 +33,7 @@ add_start_docstrings_to_model_forward, logging, replace_return_docstrings, + torch_int, ) from .configuration_clipseg import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig @@ -165,36 +166,43 @@ def __init__(self, config: CLIPSegVisionConfig): def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 - num_positions = position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return position_embeddings - class_pos_embed = position_embeddings[:, 0] - patch_pos_embed = position_embeddings[:, 1:] + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] - height = height // self.config.patch_size - width = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - height, width = height + 0.1, width + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( patch_pos_embed, - scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + size=(new_height, new_width), mode="bicubic", align_corners=False, 
) - if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 1a216b03614a..4fa925294b5a 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -36,7 +36,13 @@ ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) from .configuration_git import GitConfig, GitVisionConfig @@ -628,37 +634,43 @@ def __init__(self, config: GitVisionConfig): def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
- Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 - num_positions = position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return position_embeddings - class_pos_embed = position_embeddings[:, 0] - patch_pos_embed = position_embeddings[:, 1:] + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] - height = height // self.config.patch_size - width = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - height, width = height + 0.1, width + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( patch_pos_embed, - scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + size=(new_height, new_width), mode="bicubic", align_corners=False, ) - if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 51a7d14e1b7e..25557f0d676b 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -37,6 +37,7 @@ add_start_docstrings_to_model_forward, logging, replace_return_docstrings, + torch_int, ) from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig @@ -406,37 +407,43 @@ def __init__(self, config: Kosmos2VisionConfig): def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. 
This method is also adapted to support torch.jit tracing. - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 - num_positions = position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return position_embeddings - class_pos_embed = position_embeddings[:, 0] - patch_pos_embed = position_embeddings[:, 1:] + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] - height = height // self.config.patch_size - width = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - height, width = height + 0.1, width + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( patch_pos_embed, - scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + size=(new_height, new_width), mode="bicubic", align_corners=False, ) - if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index d289ce286a26..763c2d0b0e89 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""PyTorch X-CLIP model.""" -import math from copy import copy from dataclasses import dataclass from typing import Any, Optional, Tuple, Union @@ -33,6 +32,7 @@ add_start_docstrings_to_model_forward, logging, replace_return_docstrings, + torch_int, ) from .configuration_x_clip import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig @@ -124,37 +124,43 @@ def __init__(self, config: XCLIPVisionConfig): def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - position_embeddings = self.position_embedding.weight.unsqueeze(0) + num_patches = embeddings.shape[1] - 1 - num_positions = position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return position_embeddings - class_pos_embed = position_embeddings[:, 0] - patch_pos_embed = position_embeddings[:, 1:] + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] - height = height // self.config.patch_size - width = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - height, width = height + 0.1, width + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( patch_pos_embed, - scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)), + size=(new_height, new_width), mode="bicubic", align_corners=False, ) - if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape From 9d751a6d15ef63f7a643be6344d429cccdc88f1d Mon Sep 17 00:00:00 2001 From: Manuel 
Sanchez Hernandez Date: Sun, 15 Sep 2024 16:09:38 +0200 Subject: [PATCH 39/49] [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip From f36537b1bc7d2ec025a2e670401d3c88b9daa82b Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Wed, 18 Sep 2024 15:36:28 +0200 Subject: [PATCH 40/49] fixing interpolate_pos_encoding function --- src/transformers/models/altclip/modeling_altclip.py | 1 + src/transformers/models/bridgetower/modeling_bridgetower.py | 1 + src/transformers/models/chinese_clip/modeling_chinese_clip.py | 1 + src/transformers/models/clip/modeling_clip.py | 1 + src/transformers/models/clipseg/modeling_clipseg.py | 1 + src/transformers/models/git/modeling_git.py | 1 + src/transformers/models/kosmos2/modeling_kosmos2.py | 1 + src/transformers/models/x_clip/modeling_x_clip.py | 1 + 8 files changed, 8 insertions(+) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 984850250863..b3ce24e10b33 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -1024,6 +1024,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 + self.position_embeddings = self.position_embedding.weight.unsqueeze(0) num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 05100382d9a3..c7b32de0d08e 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -295,6 +295,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 + self.position_embeddings = self.position_embedding.weight.unsqueeze(0) num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 22e45108a0e7..393d5784bb44 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -200,6 +200,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 + self.position_embeddings = self.position_embedding.weight.unsqueeze(0) num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 6f32e81b7133..370f17f47965 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -208,6 +208,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 + self.position_embeddings = self.position_embedding.weight.unsqueeze(0) num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes diff --git a/src/transformers/models/clipseg/modeling_clipseg.py
b/src/transformers/models/clipseg/modeling_clipseg.py index 356db0ddffd3..90520524fa88 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -175,6 +175,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 + self.position_embeddings = self.position_embedding.weight.unsqueeze(0) num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 4fa925294b5a..fe56acb95ce9 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -643,6 +643,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 + self.position_embeddings = self.position_embedding.weight.unsqueeze(0) num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 25557f0d676b..a4679e20cadf 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -416,6 +416,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 + self.position_embeddings = self.position_embedding.weight.unsqueeze(0) num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 763c2d0b0e89..d05db378443e 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -133,6 +133,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: """ num_patches = embeddings.shape[1] - 1 + self.position_embeddings = self.position_embedding.weight.unsqueeze(0) num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes From 4170cbab50aa8d22e78888418efcf8a544f46fef Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Wed, 18 Sep 2024 21:40:09 +0200 Subject: [PATCH 41/49] adapting output tensors in tests --- tests/models/altclip/test_modeling_altclip.py | 2 +- tests/models/bridgetower/test_modeling_bridgetower.py | 2 +- tests/models/chinese_clip/test_modeling_chinese_clip.py | 2 +- tests/models/clip/test_modeling_clip.py | 2 +- tests/models/clipseg/test_modeling_clipseg.py | 2 +- tests/models/git/test_modeling_git.py | 2 +- tests/models/kosmos2/test_modeling_kosmos2.py | 2 +- tests/models/x_clip/test_modeling_x_clip.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 82b4f70c491a..f4ac29479c5f 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -632,7 +632,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice =
torch.tensor( - [[-0.3671, -0.5896, 0.3435], [0.3136, 0.1141, 0.7695], [1.1259, -0.5578, 0.1346]] + [[-0.3589, -0.5939, 0.3534], [0.4346, 0.1647, 0.7071], [1.1404, -0.4716, 0.1664]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index 5ae65820b72b..cceeee4912dc 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -686,7 +686,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.image_features.shape, expected_shape) expected_slice = torch.tensor( - [[-0.6931, 0.5243, -0.4443], [-2.5986, -0.0715, -0.4051], [-2.5374, -0.0969, -0.4116]] + [[-0.6518, 0.4978, -0.4544], [-2.6672, -0.0843, -0.4210], [-2.4510, -0.1002, -0.3458]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 072ce9b41735..647b3ac7b73a 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -772,7 +772,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.4035, 0.3008, -0.1223], [-0.1505, -0.2903, 0.0250], [-0.3128, -0.5132, 0.8456]] + [[-0.3990, 0.2983, -0.1239], [-0.1452, -0.2759, 0.0403], [-0.3149, -0.4763, 0.8555]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 05e5f2f7dcb9..c94aa0412653 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -1210,7 +1210,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.1599, 0.0276, -0.3315], [0.2613, 0.1183, -0.5668], [0.0244, 0.1978, -0.6078]] + [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index d724c6cef114..fb3a13e352ab 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -844,7 +844,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.1599, 0.0276, -0.3315], [0.2613, 0.1183, -0.5668], [0.0244, 0.1978, -0.6078]] + [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index ef732082d036..c0952940cfd2 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -594,7 +594,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-1.0274, 2.6038, 0.8594], [1.6899, 1.3264, -0.5352], [-1.4955, -0.1172, 0.0266]] + [[-1.0296, 2.5960, 0.8703], [1.7027, 1.3302, -0.4543], [-1.4932, -0.1084, 0.0502]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, 
:3], expected_slice, atol=1e-4)) diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 47d10934edd1..913111c0a088 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -793,7 +793,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[1.0551, -1.1680, 3.2926], [2.7077, 0.0720, -0.7721], [1.5863, 0.1665, -0.5936]] + [[1.0022, -1.1901, 3.2887], [2.6164, 0.0515, -0.8270], [1.8315, 0.1272, -0.8590]] ).to(torch_device) self.assertTrue( diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 57139d288e4f..8b91019bae18 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -761,7 +761,7 @@ def test_inference_interpolate_pos_encoding(self): self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.0312, 0.2034, 0.0556], [0.0323, 0.5420, -0.1794], [-0.0952, 0.7817, -0.3186]] + [[0.0126, 0.2109, 0.0609], [0.0448, 0.5862, -0.1688], [-0.0881, 0.8525, -0.3044]] ).to(torch_device) self.assertTrue( From d00d7b356183c2f468fba6a012b128ef7547f5d2 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Tue, 24 Sep 2024 10:53:49 +0200 Subject: [PATCH 42/49] [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip From 44f969580299d4f934cf3419c3945a2d727a73c0 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Tue, 24 Sep 2024 17:02:39 +0200 Subject: [PATCH 43/49] modifying output tensor --- tests/models/clipseg/test_modeling_clipseg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index fb3a13e352ab..1eedf828db48 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -804,8 +804,9 @@ def test_inference_image_segmentation(self): torch.Size((3, 352, 352)), ) expected_masks_slice = torch.tensor( - [[-7.4729, -7.4890, -7.3732], [-7.3379, -7.1001, -7.1432], [-6.9942, -6.8002, -6.9010]] + [[-7.4613, -7.4785, -7.3627], [-7.3268, -7.0898, -7.1333], [-6.9838, -6.7900, -6.8913]] ).to(torch_device) + print(outputs.logits[0, :3, :3]) self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3)) # verify conditional and pooled output From d70c2b376d7ef61ddebd077b717a0fa8897c4a23 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Tue, 24 Sep 2024 17:02:59 +0200 Subject: [PATCH 44/49] [run_slow] altclip, bridgetower, chinese_clip, clip, clipseg, git, kosmos2, x_clip From 299b979debf339e837a496af3732c3bd46f06658 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Wed, 25 Sep 2024 11:20:40 +0200 Subject: [PATCH 45/49] adding the correct tensor --- tests/models/clipseg/test_modeling_clipseg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 1eedf828db48..253f8412fe7a 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -806,12 +806,12 @@ def test_inference_image_segmentation(self): expected_masks_slice = torch.tensor( [[-7.4613, -7.4785, -7.3627], [-7.3268, -7.0898, -7.1333], 
[-6.9838, -6.7900, -6.8913]] ).to(torch_device) - print(outputs.logits[0, :3, :3]) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3)) # verify conditional and pooled output expected_conditional = torch.tensor([0.5601, -0.0314, 0.1980]).to(torch_device) - expected_pooled_output = torch.tensor([0.5050, -0.2674, -0.2627]).to(torch_device) + expected_pooled_output = torch.tensor([0.5036, -0.2681, -0.2644]).to(torch_device) self.assertTrue(torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)) self.assertTrue(torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)) From 55572b499ef7cf1f57605d24f2d8727d112f8da4 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Wed, 25 Sep 2024 11:21:34 +0200 Subject: [PATCH 46/49] [run_slow] clipseg From d121d89a26dabd6aa102ec42ec13f010ad3fd811 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Wed, 25 Sep 2024 11:24:37 +0200 Subject: [PATCH 47/49] fixing spaces --- tests/models/clipseg/test_modeling_clipseg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 253f8412fe7a..c5edf7cb757b 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -806,7 +806,7 @@ def test_inference_image_segmentation(self): expected_masks_slice = torch.tensor( [[-7.4613, -7.4785, -7.3627], [-7.3268, -7.0898, -7.1333], [-6.9838, -6.7900, -6.8913]] ).to(torch_device) - + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3)) # verify conditional and pooled output From 7afedcfcbc6a269c4ece8fbaf90b0306a5d0b005 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Wed, 25 Sep 2024 11:25:24 +0200 Subject: [PATCH 48/49] [run_slow] clipseg From 3be2b60e4efaf0a8694310418b04404bc8371c09 Mon Sep 17 00:00:00 2001 From: Manuel Sanchez Hernandez Date: Wed, 25 Sep 2024 11:26:25 +0200 Subject: [PATCH 49/49] [run_slow] clipseg
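
For context, the resampling that these patches apply to the vision position embeddings can be reproduced outside the library with plain PyTorch. The sketch below mirrors the interpolate_pos_encoding hunks above (class embedding kept as-is, patch grid resized with bicubic interpolation); it swaps the library's torch_int helper for a plain int(), since tracing is not a concern here, and the sizes in the usage line (768-dim embeddings, 32-pixel patches, a 7x7 patch grid plus one class position, a 480x480 input) are illustrative assumptions rather than values taken from these diffs.

    import torch
    from torch import nn

    def interpolate_pos_encoding(position_embeddings, height, width, patch_size):
        # position_embeddings: (1, 1 + num_positions, dim), class embedding first
        num_positions = position_embeddings.shape[1] - 1
        class_pos_embed = position_embeddings[:, :1]
        patch_pos_embed = position_embeddings[:, 1:]
        dim = position_embeddings.shape[-1]

        # target patch grid for the requested image resolution
        new_height = height // patch_size
        new_width = width // patch_size

        # restore the square grid the embeddings were trained on, then resample it
        sqrt_num_positions = int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_height, new_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    # hypothetical tower pre-trained with 1 class + 7*7 patch positions, evaluated at 480x480
    pos = torch.randn(1, 50, 768)
    print(interpolate_pos_encoding(pos, 480, 480, 32).shape)  # torch.Size([1, 226, 768])

In the models touched by this series, the same resampling is reached by passing interpolate_pos_encoding=True to the vision forward pass introduced in these patches; the sketch is only meant to make the resulting tensor shapes easy to check against the expected_slice updates in the test patches above.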