From bfe1a58a879d010e8a83bd224570a451ae1bfae5 Mon Sep 17 00:00:00 2001
From: simonJJJ <821898965@qq.com>
Date: Wed, 28 Aug 2024 17:39:57 +0800
Subject: [PATCH 1/8] qwen2vl_align_kv_seqlen_to_qwen2

---
 .../models/qwen2_vl/modeling_qwen2_vl.py      | 22 +++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
index 6ab813ad9ade91..5e7919a95a7dce 100644
--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -550,8 +550,13 @@ def forward(
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += cache_position[0] + 1
-
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         query_states, key_states = apply_multimodal_rotary_pos_emb(
             query_states, key_states, cos, sin, position_ids, self.rope_scaling["mrope_section"]
@@ -632,10 +637,19 @@ def forward(
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += cache_position[0] + 1
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
         # Because the input can be padded, the absolute sequence length depends on the max position id.
-        rotary_seq_len = cache_position[-1]
+        rotary_seq_len = (
+            max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
+        )
+
         cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
 
         query_states, key_states = apply_multimodal_rotary_pos_emb(

From c99c6bff298c7745d513628112e9c101fe810c2c Mon Sep 17 00:00:00 2001
From: "baishuai.bs" <1051314669@qq.com>
Date: Thu, 5 Sep 2024 15:47:28 +0800
Subject: [PATCH 2/8] flash att test

---
 .../models/qwen2_vl/test_modeling_qwen2_vl.py | 134 +++++++++++++-----
 1 file changed, 99 insertions(+), 35 deletions(-)

diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index ba3b30d94533e0..ec6444331ed2d7 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -28,7 +28,9 @@
 )
 from transformers.testing_utils import (
     require_bitsandbytes,
+    require_flash_attn,
     require_torch,
+    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -334,12 +336,12 @@ def test_small_model_integration_test(self):
 
         expected_pixel_slice = torch.tensor(
             [
-                [0.8501, 0.8647, 0.8647],
-                [1.0106, 1.0106, 1.0252],
-                [0.9960, 1.0106, 1.0252],
-                [1.0982, 1.1128, 1.1274],
-                [1.0836, 1.0982, 1.0982],
-                [1.1858, 1.1858, 1.1858],
+                [0.8501, 0.8501, 0.8647],
+                [0.9376, 0.9376, 0.9376],
+                [0.9084, 0.9376, 0.9376],
+                [1.0252, 1.0252, 1.0544],
+                [1.0252, 1.0252, 1.0252],
+                [1.0836, 1.0836, 1.0836],
             ],
             dtype=torch.float32,
             device="cpu",
@@ -350,7 +352,7 @@ def test_small_model_integration_test(self):
         inputs = inputs.to(torch_device)
 
         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?assistant\nThe dog in the picture appears to be a Labrador Retriever or a similar breed. Labradors are known for their friendly and intelligent nature,"
+        EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices"
 
         self.assertEqual(
             self.processor.decode(output[0], skip_special_tokens=True),
@@ -392,57 +394,119 @@ def test_small_model_integration_test_batch_wo_image(self):
             {"role": "user", "content": "Who are you?"},
         ]
         text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
-        inputs = self.processor(text=[text, text2], images=[self.image], return_tensors="pt").to(torch_device)
+        inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
+            torch_device
+        )
 
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?assistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and outgoing personalities, as well as their",
-            "system\nYou are a helpful assistant.user\nWho are you?assistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to assist with various tasks and answer a wide range of questions to",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to answer a wide range of questions and provide information on various topics",
         ]
 
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
+            self.processor.batch_decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT[0],
+        )
+        self.assertEqual(
+            self.processor.batch_decode(output[1], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT[1],
         )
 
     @slow
     @require_bitsandbytes
     def test_small_model_integration_test_batch_different_resolutions(self):
         model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", load_in_4bit=True)
-        text, vision_infos = self.processor.apply_chat_template(
-            self.messages, tokenize=False, add_generation_prompt=True
+        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
+        text2 = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
+        image2 = self.image.resize((224, 224))
+        inputs = self.processor(text=[text, text2], images=[self.image, image2], padding=True, return_tensors="pt").to(
+            torch_device
+        )
+
+        # it should not matter whether two images are the same size or not
+        output = model.generate(**inputs, max_new_tokens=30)
+
+        EXPECTED_DECODED_TEXT = [
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Golden Retriever. Golden Retrievers are known for their friendly and affectionate nature, as well as",
+        ]
+        self.assertEqual(
+            self.processor.batch_decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT[0],
+        )
+        self.assertEqual(
+            self.processor.batch_decode(output[1], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT[1],
+        )
+
+    @slow
+    @require_flash_attn
+    @require_torch_gpu
+    def test_small_model_integration_test_batch_flashatt2(self):
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-7B-Instruct",
+            torch_dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2",
+            device_map="auto",
+        )
+        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
+        inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
+            torch_device
+        )
+
+        # it should not matter whether two images are the same size or not
+        output = model.generate(**inputs, max_new_tokens=30)
+
+        EXPECTED_DECODED_TEXT = [
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+        ]
+
+        self.assertEqual(
+            self.processor.batch_decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT[0],
+        )
+        self.assertEqual(
+            self.processor.batch_decode(output, skip_special_tokens=True)[0],
+            self.processor.batch_decode(output, skip_special_tokens=True)[1],
         )
+
+    @slow
+    @require_flash_attn
+    @require_torch_gpu
+    def test_small_model_integration_test_batch_wo_image_flashatt2(self):
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-7B-Instruct",
+            torch_dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2",
+            device_map="auto",
+        )
+        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
         messages2 = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
-                        "resized_height": 504,
-                        "resized_width": 252,
-                    },
-                    {"type": "text", "text": "What kind of dog is this?"},
-                ],
-            }
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Who are you?"},
         ]
-        text2, vision_infos2 = self.processor.apply_chat_template(
-            messages2, tokenize=False, add_generation_prompt=True
+        text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
+        inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
+            torch_device
         )
-        inputs = self.processor(
-            text=[text, text2], vision_infos=[vision_infos, vision_infos2], return_tensors="pt"
-        ).to(torch_device)
 
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?assistant\nThe dog in the picture appears to be a Labrador Retriever or a similar breed. Labradors are known for their friendly and intelligent nature,",
-            "system\nYou are a helpful assistant.\nuser\nWho are you?assistant\nI am a large language model created by Alibaba Cloud. I am called Qwen.",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to answer a wide range of questions and provide information on various topics",
         ]
+
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
+            self.processor.batch_decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT[0],
+        )
+        self.assertEqual(
+            self.processor.batch_decode(output, skip_special_tokens=True)[0],
+            self.processor.batch_decode(output, skip_special_tokens=True)[1],
         )

From 35f0e2d5c907b24887e9723d122a3fccf5a12299 Mon Sep 17 00:00:00 2001
From: ShuaiBai623 <baishuai623@icloud.com>
Date: Thu, 5 Sep 2024 17:05:54 +0800
Subject: [PATCH 3/8] [run-slow] qwen2_vl

---
 .../models/qwen2_vl/test_modeling_qwen2_vl.py | 28 ++++++-------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index ec6444331ed2d7..9a1938d905b11c 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -407,12 +407,8 @@ def test_small_model_integration_test_batch_wo_image(self):
         ]
 
         self.assertEqual(
-            self.processor.batch_decode(output[0], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT[0],
-        )
-        self.assertEqual(
-            self.processor.batch_decode(output[1], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT[1],
+            self.processor.batch_decode(output, skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
         )
 
     @slow
@@ -434,12 +430,8 @@ def test_small_model_integration_test_batch_different_resolutions(self):
             "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Golden Retriever. Golden Retrievers are known for their friendly and affectionate nature, as well as",
         ]
         self.assertEqual(
-            self.processor.batch_decode(output[0], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT[0],
-        )
-        self.assertEqual(
-            self.processor.batch_decode(output[1], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT[1],
+            self.processor.batch_decode(output, skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
         )
 
     @slow
@@ -466,8 +458,8 @@ def test_small_model_integration_test_batch_flashatt2(self):
         ]
 
         self.assertEqual(
-            self.processor.batch_decode(output[0], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT[0],
+            self.processor.batch_decode(output, skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
         )
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True)[0],
@@ -503,10 +495,6 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         ]
 
         self.assertEqual(
-            self.processor.batch_decode(output[0], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT[0],
-        )
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True)[0],
-            self.processor.batch_decode(output, skip_special_tokens=True)[1],
+            self.processor.batch_decode(output, skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
         )

From ba455d351dc8fd3349332d2bcfba92491a869412 Mon Sep 17 00:00:00 2001
From: ShuaiBai623 <baishuai623@icloud.com>
Date: Thu, 5 Sep 2024 18:53:32 +0800
Subject: [PATCH 4/8] [run-slow] qwen2_vl fix OOM

---
 .../models/qwen2_vl/test_modeling_qwen2_vl.py | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 9a1938d905b11c..13c8c904927c0e 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -27,7 +27,6 @@
     is_vision_available,
 )
 from transformers.testing_utils import (
-    require_bitsandbytes,
     require_flash_attn,
     require_torch,
     require_torch_gpu,
@@ -313,7 +312,7 @@ def setUp(self):
                 ],
             }
         ]
-        url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+        url = "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
         self.image = Image.open(requests.get(url, stream=True).raw)
 
     def tearDown(self):
@@ -321,11 +320,9 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     @slow
-    @require_bitsandbytes
     def test_small_model_integration_test(self):
         model = Qwen2VLForConditionalGeneration.from_pretrained(
-            "Qwen/Qwen2-VL-7B-Instruct",
-            load_in_4bit=True,
+            "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
         )
 
         text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
@@ -336,17 +333,17 @@ def test_small_model_integration_test(self):
 
         expected_pixel_slice = torch.tensor(
             [
-                [0.8501, 0.8501, 0.8647],
-                [0.9376, 0.9376, 0.9376],
-                [0.9084, 0.9376, 0.9376],
-                [1.0252, 1.0252, 1.0544],
-                [1.0252, 1.0252, 1.0252],
-                [1.0836, 1.0836, 1.0836],
+                [0.8792, 0.8792, 0.9084],
+                [1.1858, 1.1858, 1.2296],
+                [1.2004, 1.2004, 1.2150],
+                [1.4340, 1.4340, 1.4194],
+                [1.3902, 1.4048, 1.4194],
+                [1.5216, 1.5362, 1.5362],
             ],
             dtype=torch.float32,
             device="cpu",
         )
-        assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-3)
+        assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)
 
         # verify generation
         inputs = inputs.to(torch_device)
@@ -360,9 +357,10 @@ def test_small_model_integration_test(self):
         )
 
     @slow
-    @require_bitsandbytes
     def test_small_model_integration_test_batch(self):
-        model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", load_in_4bit=True)
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+        )
         text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
         inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
             torch_device
@@ -372,8 +370,8 @@ def test_small_model_integration_test_batch(self):
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?assistant\nThe dog in the picture appears to be a Labrador Retriever or a similar breed. Labradors are known for their friendly and intelligent nature,",
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?assistant\nThe dog in the image appears to be a Labrador Retriever or a similar breed. Labradors are known for their friendly and outgoing nature,",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
         ]
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -385,9 +383,10 @@ def test_small_model_integration_test_batch(self):
         )
 
     @slow
-    @require_bitsandbytes
     def test_small_model_integration_test_batch_wo_image(self):
-        model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", load_in_4bit=True)
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+        )
         text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
         messages2 = [
             {"role": "system", "content": "You are a helpful assistant."},
@@ -412,9 +411,10 @@ def test_small_model_integration_test_batch_wo_image(self):
         )
 
     @slow
-    @require_bitsandbytes
     def test_small_model_integration_test_batch_different_resolutions(self):
-        model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", load_in_4bit=True)
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+        )
         text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
         text2 = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
         image2 = self.image.resize((224, 224))
@@ -427,7 +427,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
 
         EXPECTED_DECODED_TEXT = [
             "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Golden Retriever. Golden Retrievers are known for their friendly and affectionate nature, as well as",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
         ]
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),

From ca2828cf600da320b6bbd6d49897810263bd48bd Mon Sep 17 00:00:00 2001
From: ShuaiBai623 <baishuai623@icloud.com>
Date: Thu, 5 Sep 2024 20:20:30 +0800
Subject: [PATCH 5/8] [run-slow] qwen2_vl

---
 .../models/qwen2_vl/test_modeling_qwen2_vl.py  | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 13c8c904927c0e..6673ee0745e04c 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -349,7 +349,7 @@ def test_small_model_integration_test(self):
         inputs = inputs.to(torch_device)
 
         output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices"
+        EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets"
 
         self.assertEqual(
             self.processor.decode(output[0], skip_special_tokens=True),
@@ -370,8 +370,8 @@ def test_small_model_integration_test_batch(self):
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
         ]
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -401,7 +401,7 @@ def test_small_model_integration_test_batch_wo_image(self):
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
             "system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to answer a wide range of questions and provide information on various topics",
         ]
 
@@ -426,8 +426,8 @@ def test_small_model_integration_test_batch_different_resolutions(self):
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
         ]
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
@@ -453,8 +453,8 @@ def test_small_model_integration_test_batch_flashatt2(self):
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
         ]
 
         self.assertEqual(
@@ -490,7 +490,7 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices",
+            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
             "system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to answer a wide range of questions and provide information on various topics",
         ]
 

From 00a141549bddcdc3190099a7c77636e817fd6002 Mon Sep 17 00:00:00 2001
From: ShuaiBai623 <43326198+ShuaiBai623@users.noreply.github.com>
Date: Thu, 5 Sep 2024 23:22:38 +0800
Subject: [PATCH 6/8] Update tests/models/qwen2_vl/test_modeling_qwen2_vl.py

Co-authored-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>
---
 tests/models/qwen2_vl/test_modeling_qwen2_vl.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 6673ee0745e04c..0be3608a00795f 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -404,7 +404,9 @@ def test_small_model_integration_test_batch_wo_image(self):
             "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
             "system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to answer a wide range of questions and provide information on various topics",
         ]
-
+            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets',
+            'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to assist with various tasks and answer questions to the best of my'
+        ] # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,

From 2094e397807b9e6b53089b8e7589322cc5ac12da Mon Sep 17 00:00:00 2001
From: ShuaiBai623 <43326198+ShuaiBai623@users.noreply.github.com>
Date: Thu, 5 Sep 2024 23:22:55 +0800
Subject: [PATCH 7/8] Update tests/models/qwen2_vl/test_modeling_qwen2_vl.py

Co-authored-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>
---
 tests/models/qwen2_vl/test_modeling_qwen2_vl.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 0be3608a00795f..46360d939c9fe0 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -373,6 +373,9 @@ def test_small_model_integration_test_batch(self):
             "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
             "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
         ]
+            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
+            'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets'
+        ] # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,

From 661b148389d997d513d60d21bd429491b9f82a98 Mon Sep 17 00:00:00 2001
From: ShuaiBai623 <baishuai623@icloud.com>
Date: Thu, 5 Sep 2024 23:28:18 +0800
Subject: [PATCH 8/8] code quality

---
 tests/models/qwen2_vl/test_modeling_qwen2_vl.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 46360d939c9fe0..536e0ab54abc45 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -370,20 +370,13 @@ def test_small_model_integration_test_batch(self):
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
-        ]
             'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices',
             'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets'
-        ] # fmt: skip
+        ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
         )
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True)[0],
-            self.processor.batch_decode(output, skip_special_tokens=True)[1],
-        )
 
     @slow
     def test_small_model_integration_test_batch_wo_image(self):
@@ -404,12 +397,9 @@ def test_small_model_integration_test_batch_wo_image(self):
         output = model.generate(**inputs, max_new_tokens=30)
 
         EXPECTED_DECODED_TEXT = [
-            "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets",
-            "system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to answer a wide range of questions and provide information on various topics",
-        ]
             'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets',
             'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to assist with various tasks and answer questions to the best of my'
-        ] # fmt: skip
+        ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,