All suggestions have been addressed, plus some changes made by `make repo-consistency` and related processes.
Manuel Sanchez Hernandez committed Aug 4, 2024
1 parent b79862d commit 0cd8a8f
Showing 9 changed files with 18 additions and 58 deletions.
2 changes: 2 additions & 0 deletions src/transformers/models/altclip/modeling_altclip.py
@@ -100,6 +100,8 @@
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, defaults to `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
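For readers unfamiliar with the flag documented above: `interpolate_pos_encoding` lets a vision tower that was pre-trained at a fixed resolution (typically 224x224) accept larger images by resizing its learned patch position embeddings to the new patch grid. The sketch below is an illustrative simplification under that assumption, not the implementation in the files touched by this commit; the function name is made up for the example.

```python
import torch
import torch.nn.functional as F


def interpolate_pos_encoding_sketch(pos_embed: torch.Tensor, height: int, width: int, patch_size: int) -> torch.Tensor:
    """Resize pre-trained position embeddings of shape (1, 1 + num_patches, dim) to a new image size."""
    class_pos = pos_embed[:, :1]   # class-token embedding is kept as-is
    patch_pos = pos_embed[:, 1:]   # (1, num_patches, dim), laid out as a square grid
    dim = pos_embed.shape[-1]

    old_grid = int(patch_pos.shape[1] ** 0.5)                 # e.g. 7 for 224 // 32
    new_h, new_w = height // patch_size, width // patch_size  # e.g. 15 x 15 for 480 // 32

    # (1, num_patches, dim) -> (1, dim, old_grid, old_grid) so 2-D interpolation applies
    patch_pos = patch_pos.reshape(1, old_grid, old_grid, dim).permute(0, 3, 1, 2)
    patch_pos = F.interpolate(patch_pos, size=(new_h, new_w), mode="bicubic", align_corners=False)
    patch_pos = patch_pos.permute(0, 2, 3, 1).reshape(1, new_h * new_w, dim)

    return torch.cat([class_pos, patch_pos], dim=1)


# A ViT-B/32 tower pre-trained at 224x224 has 7x7 = 49 patch positions plus a class token;
# at 480x480 it needs 15x15 = 225, i.e. a sequence length of 226.
resized = interpolate_pos_encoding_sketch(torch.randn(1, 50, 768), height=480, width=480, patch_size=32)
print(resized.shape)  # torch.Size([1, 226, 768])
```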
2 changes: 2 additions & 0 deletions src/transformers/models/bridgetower/modeling_bridgetower.py
@@ -113,6 +113,8 @@
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
interpolate_pos_encoding (`bool`, defaults to `False`):
Whether to interpolate the pre-trained position encodings.
"""


2 changes: 1 addition & 1 deletion src/transformers/models/kosmos2/modeling_kosmos2.py
@@ -120,7 +120,7 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
interpolate_pos_encoding (`bool`, *optional*):
interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
10 changes: 1 addition & 9 deletions tests/models/altclip/test_modeling_altclip.py
@@ -607,7 +607,7 @@ def test_inference_interpolate_pos_encoding(self):
model_name = "BAAI/AltCLIP"
model = AltCLIPModel.from_pretrained(model_name).to(torch_device)

image_processor = AltCLIPProcessor.from_pretrained(model_name)
image_processor = AltCLIPProcessor.from_pretrained(model_name, size={"shortest_edge": 480})

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
@@ -623,11 +623,3 @@ def test_inference_interpolate_pos_encoding(self):
print(outputs.vision_model_output.last_hidden_state[0, :3, :3])

self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)

expected_slice = torch.tensor(
[[-0.5297, -0.7713, 0.4655], [0.8688, 0.1690, 0.6678], [1.1742, -0.7551, 0.0396]]
).to(torch_device)

self.assertTrue(
torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
)
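As a usage note mirroring the test above (a hedged sketch, not additional code from this commit): running an AltCLIP checkpoint on inputs larger than its pre-training resolution combines a processor override with `interpolate_pos_encoding=True` on the forward call.

```python
import torch
from PIL import Image
from transformers import AltCLIPModel, AltCLIPProcessor

model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
# Overriding `size` requests a larger resize; whether the final pixel values actually
# exceed 224x224 also depends on the processor's crop settings for this checkpoint.
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP", size={"shortest_edge": 480})

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = processor(text="what's in the image", images=image, return_tensors="pt")

with torch.no_grad():
    # Without interpolate_pos_encoding=True, pixel inputs larger than the pre-training
    # resolution would not match the model's fixed number of position embeddings.
    outputs = model(**inputs, interpolate_pos_encoding=True)

print(outputs.vision_model_output.last_hidden_state.shape)
```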
10 changes: 2 additions & 8 deletions tests/models/bridgetower/test_modeling_bridgetower.py
@@ -666,7 +666,7 @@ def test_inference_interpolate_pos_encoding(self):
model_name = "BridgeTower/bridgetower-base"
model = BridgeTowerModel.from_pretrained(model_name).to(torch_device)

image_processor = BridgeTowerProcessor.from_pretrained(model_name)
image_processor = BridgeTowerProcessor.from_pretrained(model_name, size={"shortest_edge": 480})

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
@@ -676,12 +676,6 @@ def test_inference_interpolate_pos_encoding(self):
outputs = model(**inputs, interpolate_pos_encoding=True)

# verify the logits
expected_shape = torch.Size((1, 325, 768))
expected_shape = torch.Size((1, 901, 768))

self.assertEqual(outputs.image_features.shape, expected_shape)

expected_slice = torch.tensor(
[[0.3433, 0.4557, -0.5287], [-0.7111, 0.6576, -1.0850], [-0.2122, 0.2021, -0.0536]]
).to(torch_device)

self.assertTrue(torch.allclose(outputs.image_features[0, :3, :3], expected_slice, atol=1e-4))
12 changes: 2 additions & 10 deletions tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -750,7 +750,7 @@ def test_inference_interpolate_pos_encoding(self):
model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
model = ChineseCLIPModel.from_pretrained(model_name).to(torch_device)

image_processor = ChineseCLIPProcessor.from_pretrained(model_name)
image_processor = ChineseCLIPProcessor.from_pretrained(model_name, size={"height": 480, "width": 480})

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
@@ -760,14 +760,6 @@ def test_inference_interpolate_pos_encoding(self):
outputs = model(**inputs, interpolate_pos_encoding=True)

# verify the logits
expected_shape = torch.Size((1, 197, 768))
expected_shape = torch.Size((1, 901, 768))

self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)

expected_slice = torch.tensor(
[[-0.3374, 0.3212, -0.1293], [-0.2208, -0.6150, 0.7010], [-0.1901, -0.6576, 0.4843]]
).to(torch_device)

self.assertTrue(
torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
)
16 changes: 5 additions & 11 deletions tests/models/clip/test_modeling_clip.py
@@ -1188,24 +1188,18 @@ def test_inference_interpolate_pos_encoding(self):
# to visualize self-attention on higher resolution images.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device)

image_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", size=480)
processor = CLIPProcessor.from_pretrained(
"openai/clip-vit-base-patch32", size={"height": 480, "width": 480}, crop_size={"height": 480, "width": 480}
)

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)

# forward pass
with torch.no_grad():
outputs = model(**inputs, interpolate_pos_encoding=True)

# verify the logits
expected_shape = torch.Size((1, 50, 768))
expected_shape = torch.Size((1, 226, 768))

self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)

expected_slice = torch.tensor(
[[-0.0966, 0.3521, -0.3485], [0.5785, 0.8967, 0.3586], [0.2314, 0.3896, 0.2557]]
).to(torch_device)

self.assertTrue(
torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
)
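For reference (not part of the diff), the sequence lengths asserted in these tests follow the usual ViT token count: one class token plus one token per patch. A quick sanity check, assuming square inputs at the sizes requested from the processors above:

```python
def vit_seq_len(height: int, width: int, patch_size: int) -> int:
    # one class token + one token per (patch_size x patch_size) patch
    return (height // patch_size) * (width // patch_size) + 1


print(vit_seq_len(480, 480, 16))  # 901 -> the ChineseCLIP ViT-B/16 expectation above
print(vit_seq_len(480, 480, 32))  # 226 -> the CLIP ViT-B/32 expectation above
print(vit_seq_len(224, 224, 32))  # 50  -> CLIP ViT-B/32 at its original 224x224 resolution
```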
12 changes: 3 additions & 9 deletions tests/models/kosmos2/test_modeling_kosmos2.py
@@ -771,7 +771,9 @@ def test_inference_interpolate_pos_encoding(self):
# to visualize self-attention on higher resolution images.
model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)

processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224", padding_side="left")
processor = AutoProcessor.from_pretrained(
"microsoft/kosmos-2-patch14-224", padding_side="left", size={"shortest_edge": 480}
)

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
@@ -784,11 +786,3 @@ def test_inference_interpolate_pos_encoding(self):
expected_shape = torch.Size((1, 257, 1024))

self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)

expected_slice = torch.tensor(
[[1.4228, -1.9611, 3.8449], [3.4988, 2.0516, 0.3597], [3.1699, 0.2604, -0.4210]]
).to(torch_device)

self.assertTrue(
torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
)
10 changes: 0 additions & 10 deletions tests/models/x_clip/test_modeling_x_clip.py
@@ -753,13 +753,3 @@ def test_inference_interpolate_pos_encoding(self):
expected_shape = torch.Size((8, 50, 768))

self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)

expected_slice = torch.tensor(
[[0.1806, 0.3649, -0.0850], [0.0210, 0.3411, -0.0637], [0.2307, 0.3106, -0.2027]]
).to(torch_device)

print(outputs.vision_model_output.last_hidden_state[0, :3, :3])

self.assertTrue(
torch.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)
)
